## 0. Configure Package Dependencies

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# import nltk
# nltk.download()  # Download text data sets, including stop words

# The input data files live in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints the names of the files found there.

import os

input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

## 1. Import the  Dataset

In [None]:
# Both files are tab-separated with the column names on the first line:
#   header=0   -> take the column names from the first line of the file
#   sep="\t"   -> fields are separated by tabs
#   quoting=3  -> csv.QUOTE_NONE: quote characters are treated as ordinary text
train = pd.read_csv(
    "../input/labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    "../input/testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

## 2. Preview the Dataset

### Training Data

In [None]:
# Show the first five rows of the training data (five is head()'s default).
train.head()

In [None]:
# Report the shape of the training frame and the names of its columns.
rows, columns = train.shape
feature_set = train.columns.values
print('Total Number of Features: ', columns)
print('Total Number of Instances: ', rows)
print('Feature Set includes: ', feature_set)

### Testing Data

In [None]:
# Show the first five rows of the testing data (five is head()'s default).
test.head()

In [None]:
# Report the shape of the testing frame and the names of its columns.
rows, columns = test.shape
feature_set = test.columns.values
print('Total Number of Features: ', columns)
print('Total Number of Instances: ', rows)
print('Feature Set includes: ', feature_set)

## 3. Data Cleaning and Text Preprocessing
- Removing **HTML Markup**: The BeautifulSoup Package
- Dealing with **Punctuation**, **Numbers** and **Stopwords**: NLTK and regular expressions
- Converting our reviews to lower case and splitting them into individual words: tokenization

In [None]:
# Cache for the English stop-word set, so it is loaded from NLTK once
# instead of once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A set gives O(1) membership tests, versus scanning a list per word.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Convert a raw movie review into a cleaned, space-separated string of words.

    Args:
        raw_review: A single raw review as a string; it may contain HTML markup.

    Returns:
        A single string holding the lower-cased words of the review, with HTML
        markup, non-letter characters and English stop words removed, joined
        by single spaces.
    """
    # 1. Remove HTML markup. The parser is named explicitly so that bs4 does
    #    not emit its "no parser was explicitly specified" warning and does not
    #    pick a different parser depending on what is installed; "html.parser"
    #    ships with Python itself.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. Drop stop words, using the cached stop-word set.
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    # 5. Join the remaining words back into one space-separated string.
    return " ".join(meaningful_words)

In [None]:
# Total number of training reviews, read from the size of the "review" column.
num_reviews = train["review"].size

print("Cleaning and parsing the training set movie reviews....")

# Cleaned reviews are collected here, in the same order as the rows of `train`.
clean_train_reviews = []

# Walk the reviews in row order, counting from 1 so the counter doubles as
# the progress number shown to the user.
for position, raw_review in enumerate(train["review"], start=1):
    # Report progress after every 1000th review.
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    # Clean this review and keep the result.
    clean_train_reviews.append(review_to_words(raw_review))

In [None]:
# Total number of testing reviews, read from the size of the "review" column.
num_reviews = test["review"].size

print("Cleaning and parsing the testing set movie reviews....")

# Cleaned reviews are collected here, in the same order as the rows of `test`.
clean_test_reviews = []

# Walk the reviews in row order, counting from 1 so the counter doubles as
# the progress number shown to the user.
for position, raw_review in enumerate(test["review"], start=1):
    # Report progress after every 1000th review.
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    # Clean this review and keep the result.
    clean_test_reviews.append(review_to_words(raw_review))

## 4. Feature Processing
- Bag of Words
- TF-IDF
- Word2vec

### 4.1 Bag of Words + Random Forest

### 4.1.1 Creating Features from a Bag of Words

In [None]:
print("Creating the bag of words....")

from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# The vocabulary is capped at 5000 words; no extra tokenizing, preprocessing or
# stop-word removal is requested here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() does two things: first it fits the model and learns the
# vocabulary; second it transforms the training data into feature vectors.
# The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the sparse result to a dense array.
train_data_features = train_data_features.toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0); use whichever this
# installation provides so the cell runs on both old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word over all training reviews.
dist = np.sum(train_data_features, axis=0)

# For each vocabulary word, print the number of times it appears in the
# training set followed by the word itself.
for tag, count in zip(vocab, dist):
    print(count, tag)

### 4.1.2 Random Forest based on BoW

In [None]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Build a Random Forest of 100 trees and fit it in one step, using the bag of
# words as features and the sentiment labels as the response variable.
# fit() returns the fitted estimator itself, so `forest` ends up holding the
# trained classifier.
#
# This may take a few minutes to run.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features,
    train["sentiment"],
)

### 4.1.3 Creating a Submission with 4.1.2 model

In [None]:
# Vectorize the cleaned test reviews with the already-fitted vectorizer and
# convert the result to a dense numpy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for every test review with the random forest.
result = forest.predict(test_data_features)

# Pair each test "id" with its predicted "sentiment" in a pandas dataframe.
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the comma-separated submission file, without the index column.
output.to_csv("Bag_of_Words_model.csv", quoting=3, index=False)

### 4.2 TF-IDF + Naive Bayes

### 4.2.1 Creating Features from a TF-IDF

In [None]:
print("Creating the TF-IDF....")

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the "TfidfVectorizer" object, which is scikit-learn's TF-IDF tool.
# use_idf / smooth_idf / sublinear_tf are documented as booleans; real bools are
# passed (rather than the integer 1) so that parameter validation in newer
# scikit-learn releases accepts them.
tfidf = TfidfVectorizer(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Merge training and testing data in order to vectorize them together.
# NOTE: this means the vocabulary and IDF weights are learned from the test
# reviews as well as the training reviews.
clean_all_reviews = clean_train_reviews + clean_test_reviews
train_index = len(clean_train_reviews)

# fit_transform() does two things: first it fits the model and learns the
# vocabulary; second it transforms the data into feature vectors.
# The input to fit_transform should be a list of strings.
all_data_features = tfidf.fit_transform(clean_all_reviews)

# Recover the training and testing rows from the combined matrix.
train_data_features = all_data_features[:train_index]
test_data_features = all_data_features[train_index:]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0); use whichever this
# installation provides.
if hasattr(tfidf, "get_feature_names_out"):
    vocab = tfidf.get_feature_names_out()
else:
    vocab = tfidf.get_feature_names()

# Sum up the TF-IDF weight of each vocabulary term over the training reviews.
# Summing a sparse matrix yields a 1 x n_terms matrix; flatten it to a 1-D
# array so that zip() below pairs every term with its own total instead of
# pairing only the first term with the whole row.
dist = np.asarray(np.sum(train_data_features, axis=0)).ravel()

# For each vocabulary term, print its total weight in the training set
# followed by the term itself.
# NOTE(review): with 1- to 3-grams and no max_features cap the vocabulary may
# be very large, so this loop may print a great many lines.
for tag, count in zip(vocab, dist):
    print(count, tag)

### 4.2.2 Naive Bayes based on TF-IDF

In [None]:
print("Training the Naive Bayes...")
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Multinomial Naive Bayes classifier with scikit-learn's default settings.
model_NB = MultinomialNB()

# Score the model with 10-fold cross-validation on ROC AUC and report the
# mean over the folds.
cv_scores = cross_val_score(
    model_NB,
    train_data_features,
    train["sentiment"],
    scoring='roc_auc',
    cv=10,
)
print("Score of 10-fold CV: ", np.mean(cv_scores))

# Fit the Naive Bayes model on the full training set, using the current
# train_data_features as features and the sentiment labels as the response
# variable.
model_NB = model_NB.fit(train_data_features, train["sentiment"])

### 4.2.3 Creating a Submission with 4.2.2 model

In [None]:
# Predict a sentiment label for every test review with the Naive Bayes model.
result = model_NB.predict(test_data_features)

# Pair each test "id" with its predicted "sentiment" in a pandas dataframe.
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the comma-separated submission file, without the index column.
output.to_csv("TF_IDF_NAIVE_BAYES_model.csv", quoting=3, index=False)

### 4.3 TF-IDF + Logistic Regression

### 4.3.2 Logistic Regression based on TF-IDF

In [None]:
print("Training the Logistic Regression...")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values of the inverse regularization strength C to search over.
grid_values = {'C': [1, 15, 30, 50]}

# Grid-search C with 20-fold cross-validation, scored by ROC AUC.
# The dual formulation (dual=True) is only implemented for the l2 penalty with
# the liblinear solver. liblinear stopped being the default solver in
# scikit-learn 0.22, so it is requested explicitly; without it the fit fails
# on newer releases.
model_LR = GridSearchCV(
    LogisticRegression(penalty='l2', dual=True, solver='liblinear', random_state=0),
    grid_values,
    scoring='roc_auc',
    cv=20)

# Fit the grid search on the training set, using the current
# train_data_features as features and the sentiment labels as the response
# variable.
#
# This may take a few minutes to run.
model_LR = model_LR.fit(train_data_features, train["sentiment"])

# Show the full cross-validation results, then the best C and its mean score.
print(model_LR.cv_results_, '\n', model_LR.best_params_, model_LR.best_score_)

### 4.3.3 Creating a Submission with 4.3.2 model

In [None]:
# Predict a sentiment label for every test review with the fitted
# logistic-regression grid search.
result = model_LR.predict(test_data_features)

# Pair each test "id" with its predicted "sentiment" in a pandas dataframe.
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the comma-separated submission file, without the index column.
output.to_csv("TF_IDF_LR_model.csv", quoting=3, index=False)