In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
    
import IPython; from IPython.display import display, HTML
def dfPrint(df):
    """Show a DataFrame in the notebook output as a rendered HTML table.

    Parameters
    ----------
    df : object exposing ``to_html()``
        The frame to render; its ``to_html()`` result is wrapped in
        ``IPython.display.HTML`` and handed to ``display``.
    """
    html_table = df.to_html()
    display(HTML(html_table))
from tqdm import tqdm
# Hook tqdm into pandas, using "progress-bar" as the bar description.
# NOTE(review): none of the cells visible here use tqdm's pandas helpers --
# confirm whether this registration is still needed.
tqdm.pandas(desc="progress-bar")

In [21]:
def import_data(train_path="./data/nlp_trip_advisor/train.csv"):
    """Load the review CSV and return it with a binary ``Sentiment`` label.

    Rows whose ``Is_Response`` or ``Description`` is missing are discarded and
    the index is renumbered from zero. The shape of the filtered raw frame is
    printed before the columns are reduced.

    Parameters
    ----------
    train_path : str, optional
        Location of the CSV file. It must provide the columns ``User_ID``,
        ``Description`` and ``Is_Response``. Defaults to the path the notebook
        has always used, so ``import_data()`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        Three columns: ``User_ID``, ``Description`` and ``Sentiment``.
        ``Sentiment`` is the integer 0 where ``Is_Response`` equals
        ``"not happy"`` and 1 otherwise.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the CSV.
    """
    raw = pd.read_csv(train_path)

    # Keep only rows that carry both a label and a review text.
    labelled = raw.dropna(subset=["Is_Response", "Description"]).reset_index(drop=True)
    print('dataset loaded with shape', labelled.shape)

    # Select columns by name instead of relying on their position after a
    # one-hot encoding, so extra or reordered CSV columns cannot mislabel data.
    reviews = labelled[["User_ID", "Description"]].copy()

    # Encode the label explicitly as an integer: pd.get_dummies yields boolean
    # columns on recent pandas releases, which would turn 0/1 into False/True.
    reviews["Sentiment"] = (labelled["Is_Response"] != "not happy").astype(int)
    return reviews

# Build the cleaned review frame, then preview five randomly drawn rows.
data = import_data()
dfPrint(data.sample(n=5))

dataset loaded with shape (38932, 5)


Unnamed: 0,User_ID,Description,Sentiment
38348,id48674,I've stayed in quite a number of budget hotels...,0
36066,id46392,Modern hotel located conveniently near all ame...,1
31774,id42100,"This was a dream, after cancelling our reserva...",1
29539,id39865,I stayed here for a business trip and stayed e...,1
27314,id37640,Though the hotel is not located in downtown LA...,0


In [22]:
# Review text is the model input; the binary sentiment flag is the target.
docs = data["Description"]
labels = data["Sentiment"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    random_state=69,
)


In [23]:
# TF-IDF features: terms must occur in at least 5 documents and in at most
# 80% of them, English stop words are removed, term frequency is sublinear.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
    stop_words='english',
)
# The vocabulary is fitted on the training split only and reused for the test split.
train_tf_idf = vectorizer.fit_transform(docs_train)
test_tf_idf = vectorizer.transform(docs_test)

In [54]:
# Model 1: linear SVM (dual=False), trained on the TF-IDF training matrix.
model1 = LinearSVC(dual=False).fit(train_tf_idf, labels_train)

# Predict the held-out reviews and report the fraction classified correctly.
result1 = model1.predict(test_tf_idf)
score1 = accuracy_score(
    y_true=labels_test,
    y_pred=result1,
    normalize=True,
    sample_weight=None,
)

print(score1)

0.868755618338


In [32]:
# Model 2: multinomial naive Bayes, trained on the TF-IDF training matrix.
model2 = MultinomialNB().fit(train_tf_idf, labels_train)

# Predict the held-out reviews and report the fraction classified correctly.
result2 = model2.predict(test_tf_idf)
score2 = accuracy_score(
    y_true=labels_test,
    y_pred=result2,
    normalize=True,
    sample_weight=None,
)

print(score2)

0.854115834082


In [53]:
# Model 3: random forest with 25 trees, depth capped at 50, at least 10
# samples per leaf, and a fixed seed.
model3 = RandomForestClassifier(
    n_estimators=25,
    max_depth=50,
    min_samples_leaf=10,
    random_state=0,
).fit(train_tf_idf, labels_train)

# Predict the held-out reviews and report the fraction classified correctly.
result3 = model3.predict(test_tf_idf)
score3 = accuracy_score(
    y_true=labels_test,
    y_pred=result3,
    normalize=True,
    sample_weight=None,
)

print(score3)

0.813406960318
