In [46]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# Load the labelled training reviews and the unlabelled test reviews
# (both files are expected in the current working directory).
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')


In [47]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# English stop-word list, built once for the whole notebook. Stored as a set so
# the `w not in stops` membership test inside cleanData is a hash lookup.
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    """Normalise a single free-text value into a cleaned string.

    Processing order: every character that is not an ASCII letter, a digit or
    whitespace is dropped, newlines become spaces, then the optional steps run
    in the order lowercase -> stop-word removal -> stemming. Each optional step
    splits on whitespace and re-joins with single spaces, so whitespace runs
    are only collapsed when at least one option is enabled.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str``. ``None`` and float NaN are
        treated as missing text.
    lowercase : bool, default False
        Lower-case every token.
    remove_stops : bool, default False
        Drop tokens found in the module-level ``stops`` set. The match is
        exact (case-sensitive).
    stemming : bool, default False
        Replace every token with its ``PorterStemmer`` stem.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "None" / "nan" and end up in the vocabulary as if they were real words.
    # `text != text` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    txt = re.sub(r'[^A-Za-z0-9\s]', r'', str(text))
    txt = re.sub(r'\n', r' ', txt)

    if lowercase:
        txt = " ".join(w.lower() for w in txt.split())

    if remove_stops:
        txt = " ".join(w for w in txt.split() if w not in stops)

    if stemming:
        st = PorterStemmer()
        txt = " ".join(st.stem(w) for w in txt.split())

    return txt


In [48]:
# Stack train on top of test with a fresh 0..n-1 index so both sets go through
# the same preprocessing; the first len(train) rows are the training rows.
combine = pd.concat([train, test], ignore_index=True)
combine.tail()

Unnamed: 0,Browser_Used,Description,Device_Used,Is_Response,User_ID
68331,Chrome,I stayed at the hotel and towers for a confere...,Mobile,,id109531
68332,Internet Explorer,Trying to stay within the Marriott family and ...,Tablet,,id109532
68333,Edge,"We stayed for - nights with our little dog,ver...",Desktop,,id109533
68334,InternetExplorer,Stayed at the Yotel over the weekend and was v...,Desktop,,id109534
68335,Mozilla Firefox,The Blakely is is comfortable is every way: th...,Mobile,,id109535


In [49]:
# Collapse the different spellings of each browser onto one integer code.
BROWSER_CODES = {
    "Google Chrome": 0, "Chrome": 0,
    "InternetExplorer": 1, "IE": 1, "Internet Explorer": 1,
    "Mozilla Firefox": 2, "Firefox": 2, "Mozilla": 2,
    "Edge": 3,
    "Safari": 4,
    "Opera": 5,
}
# Values missing from the table (including NaN and codes written by a previous
# run of this cell) pass through unchanged, exactly as with per-value .loc writes.
combine["Browser_Used"] = combine["Browser_Used"].map(
    lambda name: BROWSER_CODES.get(name, name)
)

# Relabel the categories with integers. rename_categories is used because
# assigning to `.cat.categories` is no longer supported by current pandas; it
# still raises if the number of observed categories differs from the list
# length, so an unexpected extra level is not silently accepted.
combine["Device_Used"] = (
    combine["Device_Used"].astype("category").cat.rename_categories([0, 1, 2])
)

# Test rows have no label, so they stay NaN after the conversion.
combine["Is_Response"] = (
    combine["Is_Response"].astype("category").cat.rename_categories([0, 1])
)

In [50]:
# Normalise every review: lower-case, drop stop words, stem.
# Note this overwrites the raw text in place, so re-running the cell re-cleans
# already-cleaned text.
combine['Description'] = [
    cleanData(review, lowercase=True, remove_stops=True, stemming=True)
    for review in combine['Description']
]

# Both vectorisers share one configuration: single-word tokens, terms must
# occur in at least 150 documents, and the vocabulary is capped at 500 terms.
vectorizer_options = dict(analyzer='word', ngram_range=(1, 1), min_df=150, max_features=500)
countvec = CountVectorizer(**vectorizer_options)
tfidfvec = TfidfVectorizer(**vectorizer_options)

In [52]:
# Learn each vocabulary from the combined (train + test) cleaned text and turn
# every review into a sparse row of term counts / TF-IDF weights.
descriptions = combine['Description']
bagofwords = countvec.fit_transform(descriptions)
tfidfdata = tfidfvec.fit_transform(descriptions)

In [53]:
from sklearn.preprocessing import LabelEncoder
# Categorical feature columns that go into the model next to the text features.
cols = ['Browser_Used', 'Device_Used']

# Re-encode each column as consecutive integers, using a separate encoder
# fitted on that column alone.
for column in cols:
    lbl = LabelEncoder().fit(combine[column])
    combine[column] = lbl.transform(combine[column])

In [54]:
# Sanity check: browser/device columns should now be integer codes and the
# descriptions cleaned and stemmed.
combine.head()

Unnamed: 0,Browser_Used,Description,Device_Used,Is_Response,User_ID
0,3,room kind clean strong smell dog gener averag ...,1,1,id10326
1,1,stay crown plaza april april staff friendli at...,1,1,id10327
2,2,book hotel hotwir lowest price could find got ...,2,1,id10328
3,1,stay husband son way alaska cruis love hotel g...,0,0,id10329
4,3,girlfriend stay celebr th birthday plan weeken...,2,1,id10330


In [55]:
# Expand both sparse term matrices into dense DataFrames
# (one row per review, one integer-named column per vocabulary term).
bow_df, tfidf_df = (
    pd.DataFrame(term_matrix.todense())
    for term_matrix in (bagofwords, tfidfdata)
)

In [56]:
# Preview the dense TF-IDF frame; columns are still plain integer positions here.
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.140913,0.0,0.0,0.0,0.0,0.29807
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.092802,0.0,0.122793,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.101179,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196513,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.08328,0.0,0.096701,0.146196,0.0,0.0,0.0,0.0,0.0


In [57]:
# Give the term columns string names ("col0", "col1", ...) so they can sit
# next to the named feature columns. Re-running this cell prefixes them again.
for term_frame in (bow_df, tfidf_df):
    term_frame.columns = ['col' + str(position) for position in term_frame.columns]

In [59]:
# Confirm the columns now carry the "col" prefix.
tfidf_df.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col490,col491,col492,col493,col494,col495,col496,col497,col498,col499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.140913,0.0,0.0,0.0,0.0,0.29807
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.092802,0.0,0.122793,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.101179,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196513,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.08328,0.0,0.096701,0.146196,0.0,0.0,0.0,0.0,0.0


In [94]:
# The combined frame holds the train rows first, so position n_train is the
# boundary between the two sets.
n_train = len(train)

# Text features: positional split of the TF-IDF frame.
tfid_df_train = tfidf_df.iloc[:n_train]
tfid_df_test = tfidf_df.iloc[n_train:]

# Categorical features and target: .loc slices are label-based and inclusive,
# hence the "- 1" on the upper bound of the train slices.
train_features = combine.loc[:n_train - 1, cols]
test_features = combine.loc[n_train:, cols]
y_train = combine.loc[:n_train - 1, ["Is_Response"]]

# Final design matrices: categorical codes followed by the TF-IDF columns.
x_train = pd.concat([train_features, tfid_df_train], axis=1)
x_test = pd.concat([test_features, tfid_df_test], axis=1)

In [103]:
# Fit multinomial Naive Bayes on the combined categorical + TF-IDF features.
# The value displayed below is accuracy on the very rows the model was fitted
# on, i.e. training accuracy, not a held-out estimate.
clf = MultinomialNB()
train_labels = np.ravel(y_train)
eq = clf.fit(x_train, train_labels)
eq.score(x_train, train_labels)

0.83846193362786392

In [104]:
# Predict a class for every test row using the model held in `eq`.
prediction=eq.predict(x_test)

In [105]:
# Inspect the predicted class codes.
prediction

array([1, 0, 0, ..., 0, 0, 0])

In [8]:
# Second experiment: TF-IDF over the raw training descriptions only, relying
# on the vectoriser's own English stop-word handling instead of cleanData.
description = train["Description"]
vec = TfidfVectorizer(stop_words='english')
print("Conveting document into tf-idf vector")
a = vec.fit_transform(description)
# NOTE(review): no vocabulary cap is set here, so the dense copy below can be
# very large -- confirm it fits in memory.
des = a.toarray()

Conveting document into tf-idf vector


In [9]:
# Peek at the dense TF-IDF array (NumPy abbreviates the display).
des

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
# Pull the two categorical columns and the target out of the raw training
# frame as NumPy arrays.
brow, dev, label = (
    np.array(train[column])
    for column in ("Browser_Used", "Device_Used", "Is_Response")
)

In [11]:
# Fit a second multinomial Naive Bayes model, this time on the train-only
# TF-IDF matrix `des` with `label` as the target.
# NOTE: this rebinds `clf` and `eq`, replacing the classifier fitted on x_train,
# so any later eq.predict(...) uses whichever of the two fitting cells ran last.
clf=MultinomialNB()
eq=clf.fit(des,label)

In [12]:
# Accuracy on the same rows the model was fitted on (training accuracy, not a
# held-out estimate).
eq.score(des,label)

0.83851330525017975

In [14]:
# Read test.csv again, rebinding `test` to a fresh, unprocessed copy, and preview it.
test=pd.read_csv("test.csv")
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [16]:
# Project the test descriptions onto the vocabulary already learned from the
# training descriptions (transform only, no refit), then densify.
description_test = test["Description"]
a_test = vec.transform(description_test)
des_test = a_test.toarray()

In [106]:
# Predict on the train-only TF-IDF features of the test set.
# NOTE(review): the recorded output for this cell is a NameError for `des_test`,
# which is defined in the cell directly above -- the cells were evidently run
# out of order. Also check that `eq` is the model fitted on `des` (same feature
# space as `des_test`) and not the one fitted on `x_train`.
prediction=eq.predict(des_test)

NameError: name 'des_test' is not defined

In [107]:
# Inspect the predictions currently held in `prediction`.
prediction

array([1, 0, 0, ..., 0, 0, 0])

In [108]:
# Make sure the predictions are a NumPy array before building the submission.
# NOTE(review): the value displayed above is already an array, so this looks
# redundant -- confirm before removing.
prediction=np.array(prediction )

In [109]:
# Row labels for the submission. The variable name is kept as-is in case other
# cells use it; it holds the User_ID values of the test set.
loan_id = np.ravel(test.loc[:, ['User_ID']])

# Translate class codes into submission labels in a single pass *before*
# building the frame. Writing strings into the integer column through .loc
# relies on silent upcasting, which pandas has deprecated. Values other than
# 0/1 pass through unchanged, matching the original conditional writes.
RESPONSE_LABELS = {1: 'not_happy', 0: 'happy'}
response_labels = pd.Series(prediction, index=loan_id).map(
    lambda code: RESPONSE_LABELS.get(code, code)
)
ans = response_labels.to_frame(name='Is_Response')

# One row per test user: index written as the User_ID column, label as Is_Response.
ans.to_csv('answer.csv', index_label=['User_ID'])