## Import Data Into Data Frame

In [1]:
import pandas as pd
data=pd.read_csv('review_restaurant.csv')
data.shape


(1630712, 9)

In [2]:
data=data.sample(frac=0.25)
data.shape

(407678, 9)

In [3]:
from sklearn.cross_validation import train_test_split

X = data['text']
Y = data['Label']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)



## Binary Representation:

To create a matrix that uses each word as a feature and keeps track of whether or not a word appears in a review text

In [4]:
from sklearn.feature_extraction.text import CountVectorizer


In [5]:
binary_vectorizer=CountVectorizer(binary=True)
binary_vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
X_train_binary = binary_vectorizer.transform(X_train)
X_test_binary = binary_vectorizer.transform(X_test)

## Linear Regression Model

In [7]:
from sklearn.linear_model import LogisticRegression
LRmodel=LogisticRegression()
LRmodel.fit(X_train_binary, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
from sklearn import metrics
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(LRmodel.predict(X_test_binary),Y_test))

Area under the ROC curve on the test data=0.632


## Naive Bayes Model

In [12]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

NBmodel=BernoulliNB()
NBmodel.fit(X_train_binary, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [13]:
from sklearn import metrics
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(NBmodel.predict(X_test_binary),Y_test))

Area under the ROC curve on the test data=0.641


## TF-idf Representation:


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tfidf_vectorizer=TfidfVectorizer()
tfidf_vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [18]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
LRmodel_tfidf=LogisticRegression()
LRmodel_tfidf.fit(X_train_tfidf, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
from sklearn import metrics
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(LRmodel_tfidf.predict(X_test_tfidf),Y_test))

Area under the ROC curve on the test data=0.646


## Naive Bayes

In [19]:
NBmodel_tfidf=BernoulliNB()
NBmodel_tfidf.fit(X_train_tfidf, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [20]:
from sklearn import metrics
print("Area under the ROC curve on the test data=%.3f"% metrics.roc_auc_score(NBmodel_tfidf.predict(X_test_tfidf),Y_test))

Area under the ROC curve on the test data=0.641
