## Sentiment Analysis

In [1]:
import csv

In [2]:
# Read the review CSV into a list of rows. Each row is a list of strings
# (the preview below shows [score, review text]).
# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are parsed by the reader rather than translated by the file object.
with open('naver_review.csv', encoding='utf8', newline='') as f:
    reader = csv.reader(f)
    # Discard the first row (presumably a header - confirm against the file).
    # The default of None keeps an empty file from raising StopIteration.
    next(reader, None)
    reviews = list(reader)

In [6]:
# Preview the first ten parsed rows.
reviews[:10]

[['10', '관객에게 미끼를 던진 영화  '],
 ['10', '반전에반전...감독한테놀아난느낌  '],
 ['9', '나감독.. 숨은 쉬게해줘야지...  '],
 ['1',
  '...할말없는 영화임.본사람은 공감할듯..무섭고 잔인하고 징그럽고 소름끼치는 반전까지 고루 갖췄지만 그 모든걸 넘어서는 찝찝함.  '],
 ['10', '어이없어서 웃기고 그냥 보고난후 뭐지...?이느낌  '],
 ['8', '보는 내내 골룸 생각났음ㅋㅋ  '],
 ['9', '맨정신으로 작두에 올라 영화보는 기분  '],
 ['10', '정말로 곡소리 나오는 영화. 222  '],
 ['10', '현혹된 자만이 느낄수 있는 공포 그리고 후유증..  '],
 ['9',
  '긴장감만점 다만9점을준건 결말이 너무모호하다 개연성도 부족한거같고 천우희가도와준이유,황전민정체,할배가천우희쫓아뛰는이유 등등 감독한테직접물어보고싶은게많다  ']]

In [3]:
import numpy

In [4]:
# Load the array saved in tdm.npy and convert it to nested Python lists.
# NOTE(review): presumably the term-document matrix whose columns line up with
# the entries of nouns.txt - confirm against the code that produced the file.
tdm = numpy.load('tdm.npy').tolist()

In [7]:
# Read nouns.txt in one go and keep one list entry per line.
with open('nouns.txt', encoding='utf8') as noun_file:
    noun_list = noun_file.read().splitlines()

In [8]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [9]:
# The first column of every review row is the star rating; parse it as an int.
stars = [int(row[0]) for row in reviews]

In [10]:
# Hold out 20% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(tdm, stars, test_size=0.2, random_state=42)

In [11]:
from sklearn import linear_model

In [12]:
# Plain linear regression with scikit-learn's default settings.
lm = linear_model.LinearRegression()

In [13]:
# Fit the linear model on the training split.
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
import operator

In [15]:
def get_important_words(model, positive=True, n=8, words=None):
    """Return the n words whose coefficients are the most extreme.

    Args:
        model: Object exposing a ``coef_`` sequence with one coefficient per
            word, in the same order as ``words``.
        positive: If True, return the words with the largest coefficients,
            in descending order; if False, return the words with the smallest
            coefficients, in ascending order.
        n: Maximum number of (word, coefficient) pairs to return.
        words: Vocabulary aligned with ``model.coef_``. Defaults to the
            notebook-level ``noun_list`` so existing calls keep working.

    Returns:
        A list of at most ``n`` ``(word, coefficient)`` tuples.
    """
    if words is None:
        # Fall back to the vocabulary loaded earlier in the notebook.
        words = noun_list
    # sorted() accepts the zip iterator directly, so no intermediate list.
    ranked = sorted(zip(words, model.coef_), key=operator.itemgetter(1), reverse=positive)
    return ranked[:n]

In [16]:
# Words with the largest coefficients in the linear model.
get_important_words(lm)

[('출연', 21.126842882548061),
 ('등장', 19.235296316984943),
 ('월광', 18.12293240550347),
 ('성도', 17.896875029243478),
 ('중심', 16.028112763828055),
 ('비교', 15.609078394650654),
 ('혼돈', 14.207428881649529),
 ('지릴뻔', 13.188009107513336)]

In [17]:
# Words with the smallest (most negative) coefficients in the linear model.
get_important_words(lm, False)

[('정체', -27.125871162746712),
 ('회수', -23.626440745128797),
 ('퀄리티', -19.124024581901331),
 ('흐트러진', -18.696989686843658),
 ('차지', -16.799749160666192),
 ('사탄', -15.082050211603269),
 ('당신', -14.072248942426102),
 ('개도', -13.796547312277804)]

In [18]:
# The coefficients come out far too large, which indicates overfitting; regularization is needed.

In [19]:
# Score of the linear model on the training split (score() reports R^2 for
# scikit-learn regressors).
lm.score(X_train, y_train)

0.62483192619036609

In [20]:
# Score of the same model on the held-out test split.
lm.score(X_test, y_test)

-1.9796775019913648

## Lasso regression

In [22]:
# Lasso regression with the regularization strength alpha set to 0.01.
lasso = linear_model.Lasso(alpha=0.01)

In [23]:
# Fit the Lasso model on the training split.
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
# Words with the largest coefficients in the Lasso model.
get_important_words(lasso)

[('현혹', 0.88713581696690991),
 ('소름', 0.84968514539862283),
 ('한국', 0.70021990178298676),
 ('완전', 0.67825442406668324),
 ('최고', 0.60862345479841662),
 ('한번', 0.60141433009348277),
 ('대박', 0.58340521981655247),
 ('나홍진', 0.54710065882888215)]

In [25]:
# Words with the smallest (most negative) coefficients in the Lasso model.
get_important_words(lasso, False)

[('쓰레기', -3.0157856495097417),
 ('최악', -2.8314911049859814),
 ('실망', -2.3545628264857945),
 ('진심', -2.1289169617709827),
 ('별로', -2.1019194392410894),
 ('평론가', -1.6022189468578882),
 ('스트레스', -1.5782762715387715),
 ('노잼', -1.4914868801606267)]

In [26]:
# Score of the Lasso model on the training split.
lasso.score(X_train, y_train)

0.22711072765475204

In [27]:
# Score of the Lasso model on the held-out test split.
lasso.score(X_test, y_test)

0.16421563673695438

## LassoCV

In [28]:
# LassoCV automatically cross-validates on the training data and fits the model with the best-performing alpha.

In [30]:
# Build LassoCV with its default settings, then fit it on the training split.
lassocv = linear_model.LassoCV()
lassocv.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [None]:
# Elastic Net - applies both the Lasso (L1) and Ridge (L2) penalties.