## Import Data Into Data Frame

In [1]:
import pandas as pd
train_df=pd.read_csv('final_dummies_training.csv',index_col=0)
validate_df=pd.read_csv('final_dummies_evaluation.csv', index_col=0)


In [2]:
X_train=train_df['text']
X_test=validate_df['text']

In [3]:
Y_train=train_df['Label']
Y_test=validate_df['Label']

# Binary Representation:

To create a matrix that uses each word as a feature and keeps track of whether or not a word appears in a review text

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
binary_vectorizer=CountVectorizer(binary=True)
binary_vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
X_train_binary = binary_vectorizer.transform(X_train)
X_test_binary = binary_vectorizer.transform(X_test)

## Logistic Regression Model

In [7]:
from sklearn.linear_model import LogisticRegression
LRmodel=LogisticRegression()
LRmodel.fit(X_train_binary, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
from sklearn import metrics
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(LRmodel.predict(X_test_binary),Y_test))

Area under the ROC curve on the test data=0.644


## Naive Bayes Model

In [9]:
from sklearn.naive_bayes import BernoulliNB
NBmodel=BernoulliNB()
NBmodel.fit(X_train_binary, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(NBmodel.predict(X_test_binary),Y_test))

Area under the ROC curve on the test data=0.640


# Count Representation:

In [11]:
count_vectorizer=CountVectorizer()
count_vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
X_train_count = count_vectorizer.transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

## Logistic Regression

In [13]:
LRmodel_count=LogisticRegression()
LRmodel_count.fit(X_train_count, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(LRmodel_count.predict(X_test_count),Y_test))

Area under the ROC curve on the test data=0.655


## Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB
NBmodel_count=MultinomialNB()
NBmodel_count.fit(X_train_count, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(NBmodel_count.predict(X_test_count),Y_test))

Area under the ROC curve on the test data=0.629


# TF-idf Representation:


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf_vectorizer=TfidfVectorizer()
tfidf_vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Logistic Regression

In [20]:
LRmodel_tfidf=LogisticRegression()
LRmodel_tfidf.fit(X_train_tfidf, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(LRmodel_tfidf.predict(X_test_tfidf),Y_test))

Area under the ROC curve on the test data=0.651


## Naive Bayes

In [22]:
NBmodel_tfidf=BernoulliNB()
NBmodel_tfidf.fit(X_train_tfidf, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [23]:
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(NBmodel_tfidf.predict(X_test_tfidf),Y_test))

Area under the ROC curve on the test data=0.640
