# NAVER 영화 평점 예측

 * Reading Dataset 
 * Text to vector with Tfid 
 * Model fitting & validtaion check 
 * Predict test data
 * Final model
 * Predict submission data 

In [2]:
import sys
import random
import pickle

import pandas as pd
import numpy as np
import json
from konlpy.tag import Twitter
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Reading Dataset

In [3]:
# read train data
with open('./data/train_data.json') as fp:
    json_str = fp.read()
    json_data = json.loads(json_str)
    
# convert to dataframe
train_df = pd.DataFrame(json_data)

# train data preprocessing
train_df['rating_cat'] = train_df['rating'].apply(lambda x: 
                    'NEG' if 1<= x <=3 
                     else 
                      ('NEU' if 4<=x<=7 
                     else 'POS'))

# read test data in/out
with open('./data/test.input') as fp:
    test_in = fp.read()
    test_in = test_in.splitlines()

with open('./data/test.output') as fp:
    test_out = fp.read()
    test_out = test_out.splitlines()


In [None]:
train_df.head()

In [None]:
test_in.head()

###  Converting text to vector with Tfid 

In [4]:
# vectorize train using Tfid(bag of words)

twitter = Twitter()

def tokenize_pos(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), use_idf=False, smooth_idf=False)


y = train_df.rating_cat
X = vectorizer.fit_transform(train_df.review)

### Model fitting & validtaion check

In [8]:
# fitting train data with classifier

model = SGDClassifier(alpha=1.9e-6, n_iter=19).fit(X, y)



# predict test data
feature_list = vectorizer.get_feature_names()

test_vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), vocabulary = feature_list)
X_test = test_vectorizer.fit_transform(test_in[:8400])


test_pred = model.predict(X_test)

# accuracy
print(accuracy_score(test_out[:8400], test_pred))

0.776547619048


In [10]:
# optimizing alpha 

alpha = np.arange(2.5e-6,3.5e-6,1e-7)

ite = 5 #  반복
score = np.zeros([len(alpha), ite])

for i, val in enumerate(alpha):
    for k in range(ite): 
        model = SGDClassifier(alpha=val).fit(X, y)
        test_pred = model.predict(X_test)

        score[i][k] = accuracy_score(test_out[:8400], test_pred)
        
print(score)
print(score.mean(axis=1))

[[ 0.77178571  0.77535714  0.775       0.7747619   0.77309524]
 [ 0.77535714  0.77488095  0.77428571  0.775       0.77452381]
 [ 0.77333333  0.77321429  0.77464286  0.7747619   0.77547619]
 [ 0.77607143  0.77202381  0.77404762  0.77345238  0.77357143]
 [ 0.77285714  0.77345238  0.77440476  0.77404762  0.77511905]
 [ 0.77607143  0.77452381  0.77369048  0.77464286  0.77392857]
 [ 0.77559524  0.77535714  0.77416667  0.77416667  0.77452381]
 [ 0.77333333  0.77369048  0.77488095  0.77238095  0.77559524]
 [ 0.77488095  0.77285714  0.77404762  0.77380952  0.77630952]
 [ 0.775       0.77380952  0.77607143  0.77404762  0.7752381 ]]
[ 0.774       0.77480952  0.77428571  0.77383333  0.77397619  0.77457143
  0.7747619   0.77397619  0.77438095  0.77483333]


In [11]:
# optimizing n-iter 

n = np.arange(15,25)


ite = 5 #  반복
score = np.zeros([len(n), ite])


for i, val in enumerate(n):
    for k in range(ite): 
        
        model = SGDClassifier(alpha=2.9e-6, n_iter=val).fit(X, y)
        test_pred = model.predict(X_test)

        score[i][k] = accuracy_score(test_out[:8400], test_pred)
        
print(score)
print(score.mean(axis=1))

[[ 0.77559524  0.7752381   0.77428571  0.77428571  0.77380952]
 [ 0.77583333  0.775       0.77452381  0.77464286  0.775     ]
 [ 0.77416667  0.77559524  0.7752381   0.77452381  0.77464286]
 [ 0.77440476  0.77571429  0.77309524  0.77404762  0.77559524]
 [ 0.77392857  0.77547619  0.77464286  0.77452381  0.77452381]
 [ 0.77440476  0.7747619   0.7747619   0.77452381  0.77392857]
 [ 0.7747619   0.7752381   0.7747619   0.77452381  0.77595238]
 [ 0.77571429  0.7752381   0.77464286  0.77404762  0.77488095]
 [ 0.77535714  0.77511905  0.77428571  0.77511905  0.77511905]
 [ 0.77547619  0.77428571  0.77511905  0.775       0.77392857]]
[ 0.77464286  0.775       0.77483333  0.77457143  0.77461905  0.77447619
  0.77504762  0.77490476  0.775       0.7747619 ]


### Final model

In [12]:
model = SGDClassifier(alpha=2.9e-6, n_iter=21).fit(X, y)

### Predict submission data

In [13]:
# submission

with open('./data/grading.input') as fp:
    sbm_in = fp.read()
    sbm_in = sbm_in.splitlines()

feature_list = vectorizer.get_feature_names()

test_vectorizer =  TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2), vocabulary = feature_list)
sbm_test = test_vectorizer.fit_transform(sbm_in[:8400])

sbm_pred = model.predict(sbm_test)

output_file = 'submission.txt'


with open(output_file, 'w') as fp:
    for _test_pred in sbm_pred:
        fp.write(_test_pred)
        fp.write('\n')