In [1]:
# coding: utf-8
import pandas as pd
import os
from lxml import etree
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

In [2]:
# Directory that holds the Kaggle TSV files.  A raw string keeps the
# backslashes of the Windows path literal instead of relying on "\k" and "\m"
# being unrecognised escape sequences (which Python warns about).
path = r"E:\kaggle\movies"
# Both files are tab-separated and carry a header row.
t_set_df = pd.read_csv(os.path.join(path,"labeledTrainData.tsv"), header=0, sep='\t')
test_df = pd.read_csv(os.path.join(path,"testData.tsv"), header=0, sep='\t')
# Raw review text, before any cleaning.
t_set_pre = t_set_df['review']
test_pre = test_df['review']
# Containers for the cleaned reviews; they are filled by the cleaning step.
t_set = []
test = []
# Training labels (the 'sentiment' column).
t_label = t_set_df['sentiment']

In [3]:
# Display the raw text of the training review at index 0, before any cleaning.
t_set_pre[0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [4]:
def review2wordlist(review):
    """Turn one raw review into a clean list of lowercase words.

    The text is parsed as HTML and reduced to its string value (dropping
    markup), every character that is not an ASCII letter is replaced by a
    space, and the result is lowercased and split on whitespace.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The lowercase words of the review; an empty list when the review
        contains no text.
    """
    # An empty or whitespace-only review gives the HTML parser nothing to build
    # a tree from, so answer directly instead of failing further down.
    if not review or not review.strip():
        return []
    html = etree.HTML(review, etree.HTMLParser())
    if html is None:
        # The parser produced no root element, so there is no text to extract.
        return []
    text = html.xpath('string(.)').strip()
    # Keep ASCII letters only; digits and punctuation become separators.
    text = re.sub("[^a-zA-Z]", " ", text)
    return text.lower().split()

In [5]:
# Clean every review and store it as a single space-joined string.
# The lists are rebuilt from scratch so that re-running this cell cannot append
# a second copy of every review, and iterating over the values avoids relying
# on the Series index labels being exactly 0..n-1.
t_set = [" ".join(review2wordlist(review)) for review in t_set_pre]
test = [" ".join(review2wordlist(review)) for review in test_pre]

In [6]:
# Vectorize the cleaned reviews with TF-IDF weights over word unigrams and
# bigrams.  The vocabulary and IDF statistics are fitted on the training and
# test text together (no labels are involved).
all_text = t_set + test
# The on/off switches are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject the integer 1.
tfv = TFIV(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=True,smooth_idf=True,sublinear_tf=True,
        stop_words = 'english')
# Sparse 2-D matrix: one row per document, one column per term, TF-IDF values.
all_x = tfv.fit_transform(all_text)
# The first len(t_set) rows are the training reviews, the rest the test reviews.
train_len = len(t_set)
x_train = all_x[:train_len]  # recorded run: 25000x309819 float64 CSR matrix with 3429925 stored elements
x_test = all_x[train_len:]

In [7]:
# Toy example: what a TF-IDF matrix looks like on a four-sentence corpus once
# English stop words are removed.
corpus = [
     'This is the first lovely document.',
     'This document is the good document.',
     'And this is the third bad one.',
     'Is this the first lazy document?']
temp_tfv=TFIV(stop_words='english')
x=temp_tfv.fit_transform(corpus)
# Newer scikit-learn only offers get_feature_names_out(); fall back to the old
# get_feature_names() so the cell also runs on older installations.
feature_names = getattr(temp_tfv, "get_feature_names_out", None) or temp_tfv.get_feature_names
print([str(name) for name in feature_names()])
# toarray() is the explicit method for densifying a sparse matrix, used here
# instead of the `.A` shorthand.
xa=x.toarray()
xa

['bad', 'document', 'good', 'lazy', 'lovely']


array([[0.        , 0.53802897, 0.        , 0.        , 0.84292635],
       [0.        , 0.78722298, 0.61666846, 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.53802897, 0.        , 0.84292635, 0.        ]])

In [8]:
# y_train = t_set_df['sentiment']
# lr = LogisticRegression(C=30)
# grid_value = {'solver':['sag','liblinear','lbfgs']}
# model_lr = GridSearchCV(lr, cv=20, scoring='roc_auc', param_grid=grid_value)
# model_lr.fit(x_train, y_train)

# print(model_lr.cv_results_)  #the best score is 0.96462 with sag

In [9]:
# Train three classifiers on the TF-IDF features and write each one's test
# predictions to its own CSV file.
y_train = t_set_df['sentiment']

# model_1: logistic regression
model_lr = LogisticRegression(C=30)
model_lr.fit(x_train,y_train)


# model_2: naive bayes
model_nb = MultinomialNB()
model_nb.fit(x_train, y_train)
print("naive bayes score: ", np.mean(cross_val_score(model_nb, x_train, y_train, cv=20, scoring='roc_auc')))  #0.94963712


# model_3: SGDClassifier (linear model trained by SGD with the modified Huber loss)
model_sgd = SGDClassifier(loss='modified_huber')
model_sgd.fit(x_train, y_train)
print("SGD score: ", np.mean(cross_val_score(model_sgd, x_train, y_train, cv=20, scoring='roc_auc'))) #0.964716288


# write the results to csv, one file per model
lr_result = model_lr.predict(x_test)
lr_df = pd.DataFrame({'id':test_df['id'], 'sentiment':lr_result})
lr_df.to_csv(os.path.join(path,"LR_result.csv"), index=False)

nb_result = model_nb.predict(x_test)
nb_df = pd.DataFrame({'id':test_df['id'],'sentiment':nb_result})
nb_df.to_csv(os.path.join(path,"NB_result.csv"), index=False)

# Predict with the SGD model itself; using model_nb here would make
# SGD_result.csv a copy of the naive Bayes predictions.
sgd_result = model_sgd.predict(x_test)
sgd_df = pd.DataFrame({'id':test_df['id'],'sentiment':sgd_result})
sgd_df.to_csv(os.path.join(path,"SGD_result.csv"), index=False)



naive bayes score:  0.94963712




SGD score:  0.9646910719999999


In [10]:
# Fourth model: a linear support vector classifier on the same TF-IDF features.
model_svc = LinearSVC().fit(x_train, y_train)
svc_res = model_svc.predict(x_test)
# Store the predictions as (id, sentiment) rows next to the other result files.
svc_df = pd.DataFrame(
    {
        'id': test_df['id'],
        'sentiment': svc_res,
    }
)
svc_df.to_csv(
    os.path.join(path, "SVC_result.csv"),
    index=False,
)

In [17]:
# Majority vote over the 0/1 predictions.  The LinearSVC prediction is counted
# twice, which gives five votes in total so a threshold of 3 can never tie.
fusion_res=svc_res+svc_res+lr_result+nb_result+sgd_result
# The builtin int replaces the np.int alias, which newer NumPy no longer has.
fusion_res=np.array(fusion_res>=3,dtype=int)

In [19]:
# Write the voted predictions; passing svc_res here would make
# fusion_result.csv a plain copy of the SVC output.
fusion_df=pd.DataFrame({'id':test_df['id'],'sentiment':fusion_res})
fusion_df.to_csv(os.path.join(path,"fusion_result.csv"),index=False)