## 1. 패키지 다운

In [11]:
import pandas as pd
review_df=pd.read_csv("labeledTrainData.tsv", header=0, sep="\t", quoting=3)
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [10]:
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

## 2. 전처리

In [2]:
import re

# HTML태그가 담겨있음을 확인. 이는 크롤링을 통해서 가져올 때 HTML을 이용하기 때문에
# 분석에 필요없는 HTML을 제거해준다.
review_df['review'] = review_df['review'].str.replace('<br />',' ')

# 특수문자에 대해서도 공백으로 바꾸는 클렌징 작업을 진행
review_df['review'] = review_df['review'].apply(lambda x : re.sub("[^a-zA-Z]"," ",x))

## 3. train/test 분리

In [3]:
from sklearn.model_selection import train_test_split

# y컬럼만 따로저장
class_df = review_df['sentiment']

# 'id'와 'sentiment'를 제거하고 x컬럼만 따로 저장
feature_df = review_df.drop(['id','sentiment'], axis = 1, inplace=False)

# train/test분리
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size = 0.3, random_state = 156)

Unnamed: 0,review
3724,This version moved a little slow for my taste...
23599,I really enjoyed this film because I have a t...
11331,Saw this in the theater in and fell out o...
15745,Recently I was looking for the newly issued W...
845,Escaping the life of being pimped by her fath...
...,...
6955,This is a generally nice film with good stor...
7653,The real shame of The Gathering is not in...
9634,In what could have been an otherwise run of t...
6860,Excellent P O W adventure adapted by Eric W...


## 4. 패키지 임포트

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

## 5. count방법

In [5]:
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words = 'english', ngram_range=(1,2))),
    ('lr_clf',LogisticRegression(C = 10))])

pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])

pred_probs = pipeline.predict_proba(X_test['review'])[:,1]
print(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.886 0.9502704586842703


## 6. TF-IDF 방법

In [7]:
pipeline = Pipeline([
    ('tfidf_vect',TfidfVectorizer(stop_words = 'english', ngram_range=(1,2)) ),
    ('lr_clf',LogisticRegression(C= 10))])

pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])

pred_probas = pipeline.predict_proba(X_test['review'])[:,1]

print(accuracy_score(y_test, pred), roc_auc_score(y_test,pred_probas))

0.8936 0.959799823582973
