In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled IMDB training reviews (tab-separated, header on the first row).
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
df = pd.read_csv(
    '../static/data/IMDB/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
df.head(n=3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [3]:
# Peek at the first 1000 characters of the review stored under index label 0.
df.loc[0, 'review'][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [4]:
# Replace the HTML line-break tags left in the raw reviews with a single space.
# regex=False forces a literal substring replacement regardless of the installed
# pandas version (the default for `regex` changed from True to False in pandas 2.0).
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [5]:
# Replace every non-English-letter character (digits, punctuation, ...) with a space.
# The leading ^ inside the brackets negates the class: anything that is NOT
# a-z or A-Z is replaced.
import re

df['review'] = df['review'].map(lambda text: re.sub('[^a-zA-Z]', ' ', text))

In [6]:
from sklearn.model_selection import train_test_split

# Features: every column except the identifier and the label.
feature_df = df.drop(columns=['id', 'sentiment'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    df['sentiment'],
    test_size=0.3,
    random_state=156,
)

X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [14]:
# Show the shapes of the training features and the training labels side by side.
X_train.shape, y_train.shape

((17500, 1), (17500,))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Bag-of-words baseline: unigram + bigram counts with English stop words removed.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn the vocabulary from the training reviews only, then reuse it for the test set.
X_train_count = count_vect.fit_transform(X_train['review'])
X_test_count = count_vect.transform(X_test['review'])

# Logistic regression on the count features; the cell displays the test accuracy.
lr_clf = LogisticRegression(C=10).fit(X_train_count, y_train)
pred = lr_clf.predict(X_test_count)
accuracy_score(y_test, pred)

0.886

In [9]:
# TF-IDF baseline: same tokenisation settings as the count model, TF-IDF weighted.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn vocabulary and IDF weights from the training reviews only, then reuse them.
X_train_tfidf = tfidf_vect.fit_transform(X_train['review'])
X_test_tfidf = tfidf_vect.transform(X_test['review'])

# Logistic regression on the TF-IDF features; the cell displays the test accuracy.
lr_clf = LogisticRegression(C=10).fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test, pred)

0.8936

## Count 벡터 기반 모델: CountVectorizer + LogisticRegression 파이프라인 튜닝

In [11]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification so both steps are fitted together.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression()),
])

In [22]:
from sklearn.model_selection import GridSearchCV

# Keys follow the "<step name>__<parameter>" convention of the pipeline.
# Integer max_df values are absolute document counts, not proportions.
params = dict(
    count_vect__max_df=[1000, 1100, 1200, 1300],
    lr_clf__C=[1, 2, 3, 5],
)

# 3-fold cross-validated search over the count-vector pipeline, scored by accuracy.
grid_pipe_count = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe_count.fit(X_train['review'], y_train)
print(grid_pipe_count.best_params_, grid_pipe_count.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 18.7min finished
{'count_vect__max_df': 1100, 'lr_clf__C': 1} 0.878686059596177


In [18]:
# Same two-step layout as the count pipeline, with TF-IDF features instead.
pipeline2 = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression()),
])

In [24]:
from sklearn.model_selection import GridSearchCV

# Keys follow the "<step name>__<parameter>" convention of the pipeline.
# Integer max_df values are absolute document counts, not proportions.
params2 = dict(
    tfidf_vect__max_df=[1200, 1250, 1300],
    lr_clf__C=[35, 40, 45, 50],
)

# 3-fold cross-validated search over the TF-IDF pipeline, scored by accuracy.
grid_pipe_tfidf = GridSearchCV(
    pipeline2,
    param_grid=params2,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe_tfidf.fit(X_train['review'], y_train)
print(grid_pipe_tfidf.best_params_, grid_pipe_tfidf.best_score_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 12.0min finished
{'lr_clf__C': 45, 'tfidf_vect__max_df': 1250} 0.8895430392682413


In [25]:
import joblib
# Persist both fitted grid-search objects to disk so they can be reloaded later.
# NOTE: joblib files are pickle-based; only load them from trusted locations.
joblib.dump(grid_pipe_count, '../static/model/IMDB_countlr.pkl')
joblib.dump(grid_pipe_tfidf, '../static/model/IMDB_tfidflr.pkl')

['../static/model/IMDB_tfidflr.pkl']

In [27]:
# Hand-written sample review used for a manual prediction check.
# The text spans several lines, so it must be a triple-quoted literal: a string
# opened with a single double quote cannot contain raw line breaks.
review = """In a nutshell: Personality-less, odd looking ginger girl drugs, drinks, hallucinates and humps her way to the top of the chess world.

The film is loaded with stereotypes that the producers obviously thought defined the mid 1960s - bad marriages, sexist guys, alcoholic middle class housewives, anal retentive government officials, mean store clerks, stiff as wood Russian chess champions, libertine bisexual French girls, mean caretakers at the orphanage, and a hip and radical, down for the struggle black girl who was her BFF in the orphanage. Kudos to the wardrobe consultant and set decorators though. They nailed it with the mid century modern clothes and furnishing."""

In [30]:
# Wrap the single review in a list so it can be passed to predict as a batch of one.
review_list = [review]

In [28]:
# Display the training frame after the cleaning steps (pandas elides the middle rows).
df

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,With all this stuff going down at the moment ...
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...
3,"""3630_4""",0,It must be assumed that those who praised thi...
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...
...,...,...,...
24995,"""3453_3""",0,It seems like more consideration has gone int...
24996,"""5064_1""",0,I don t believe they made this film Complete...
24997,"""10905_3""",0,Guy is a loser Can t get girls needs to bui...
24998,"""10194_3""",0,This minute documentary Bu uel made in the...


In [34]:
import re

# Both models were fitted on reviews whose <br /> tags and non-letter characters
# had been replaced by spaces. Apply the same cleaning to the raw text before
# predicting so the inputs are tokenised the same way as the training data.
cleaned_reviews = [
    re.sub('[^a-zA-Z]', ' ', text.replace('<br />', ' '))
    for text in review_list
]
pred = grid_pipe_count.predict(cleaned_reviews)
pred2 = grid_pipe_tfidf.predict(cleaned_reviews)

In [36]:
# Print the first predicted label from the count-vector model and the TF-IDF model.
print(pred[0], pred2[0])

1 1


In [37]:
# Row position of the review to pull out as a sample for prediction.
index = 100

In [40]:
# Take the review text at the chosen row position (review is the last column).
test_data = df['review'].iloc[index]

In [44]:
# Display the selected review text.
test_data

' There is a uk edition to this show which is rather less extravagant than the US version  The person concerned will get a new kitchen or perhaps bedroom and bathroom and is wonderfully grateful for what they have got  The US version of this show is everything that reality TV shouldn t be  Instead of making a few improvements to a house which the occupants could not afford or do themselves the entire house gets rebuilt  I do not know if this show is trying to show what a lousy welfare system exists in the US or if you beg hard enough you will receive  The rather vulgar product placement that takes place  particularly by Sears  is also uncalled for  Rsther than turning one family in a deprived area into potential millionaires  it would be far better to help the community as a whole where instead of spending the hundreds of thousands of dollars on one home  build something for the whole community       perhaps a place where diy and power tools can be borrowed and returned along with buil

In [41]:
# Wrap the selected review in a list so it can be passed to predict as a batch of one.
test = [test_data]

In [42]:
# Run the same sample through both fitted grid-search models, count model first.
pred, pred2 = (
    model.predict(test) for model in (grid_pipe_count, grid_pipe_tfidf)
)

In [43]:
# Print the first predicted label from the count-vector model and the TF-IDF model.
print(pred[0], pred2[0])

0 0


In [45]:
# Load the IMDB test reviews. This rebinds `df`, so from here on `df` no longer
# refers to the training frame.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
df = pd.read_csv(
    '../static/data/IMDB/testData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
df.head(n=3)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."


In [47]:
# Take the review text at the chosen row position (review is the last column).
test_data = df['review'].iloc[index]

In [48]:
# Wrap the selected review in a list so it can be passed to predict as a batch of one.
test = [test_data]

In [49]:
import re

# The review taken from testData.tsv is still raw text (quote characters, <br />
# tags, digits and punctuation), while both models were fitted on reviews where
# <br /> tags and non-letter characters had been replaced by spaces. Apply the
# same cleaning here so the input is tokenised the same way as the training data.
cleaned_test = [
    re.sub('[^a-zA-Z]', ' ', text.replace('<br />', ' '))
    for text in test
]
pred = grid_pipe_count.predict(cleaned_test)
pred2 = grid_pipe_tfidf.predict(cleaned_test)

In [50]:
# Print the first predicted label from the count-vector model and the TF-IDF model.
print(pred[0], pred2[0])

1 1


In [51]:
# Translate the count model's numeric label into text: 1 -> positive, otherwise negative.
print('긍정' if pred[0] == 1 else '부정')

긍정
