### IMDB 영화평 감성분석(이진분류)

- CountVectorizer + LogisticRegression

In [1]:
import numpy as np 
import pandas as pd

##### 1. 데이터 탐색

In [2]:
# First attempt: read the tab-separated training file with pandas' default quote handling.
# With the defaults, double quotes are interpreted as quoting characters instead of data,
# which alters the review text (see row 1 of the preview, and the quoting=3 read that follows).
df = pd.read_csv('data/labeledTrainData.tsv' , sep='\t')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# The correct way to read this file.
# quoting=3 is csv.QUOTE_NONE: quote characters get no special treatment and stay in the
# data as-is. Compare the backslash/quote part of row 1's review with the default read.
df = pd.read_csv('data/labeledTrainData.tsv' , sep='\t', quoting=3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Summarize the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [5]:
# y값이 sentiment 
# review를 data로 다룸

In [10]:
# Show the first 1000 characters of the first review to inspect the raw text.
print(df.review[0][:1000])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [7]:
# Count missing values in each column.
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

##### 2. 텍스트 전처리

In [11]:
# Swap every literal '<br />' tag for a single space.
# regex=False spells out that the tag is matched as plain text, not as a pattern.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [14]:
# Remove punctuation and digits: every character that is not an English letter becomes a space.
# regex=True is needed so that the character class is treated as a pattern.
df['review'] = df['review'].str.replace('[^A-Za-z]', ' ', regex=True)

In [15]:
# Preview the first 200 characters of the first review after cleaning.
df.review[0][:200]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want '

##### 3. 데이터 셋 분리

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# positive/negative ratio the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'].values,
    df['sentiment'].values,
    test_size=0.2,
    stratify=df['sentiment'].values,
    random_state=2023,
)
# Class counts in the training split.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([10000, 10000], dtype=int64))

##### 4. Text Encoding

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
cvect = CountVectorizer(stop_words='english')

In [19]:
# Anti-pattern, shown only as a demonstration -- do NOT do this.
# Calling fit_transform separately on train and test refits the vectorizer each time, so the
# two matrices get different vocabularies: the column counts differ and the word-to-column
# indices no longer line up.
cvect.fit_transform(X_train).shape, cvect.fit_transform(X_test).shape

((20000, 66602), (5000, 37763))

In [None]:
# fit과 transform을 분리해야 함. 이유:
# train과 test에 각각 fit을 하면 동일한 단어가 서로 다른 숫자(인덱스)로 매핑되고,
# 컴퓨터(모델)는 숫자만 보기 때문에 엉뚱한 결과가 나올 수 있음.
# 따라서 train에서 학습한 것을 test에 그대로 적용해야 함.
# train 셋에 없는 단어는 버려지지만 대세에 지장은 없음.

In [20]:
# The right way: learn the vocabulary from the training reviews only, then encode both
# splits with that same fitted vectorizer so their columns match.
X_train_cv = cvect.fit(X_train).transform(X_train)  # sparse matrix of per-review word counts
X_test_cv = cvect.transform(X_test)
X_train_cv.shape, X_test_cv.shape

((20000, 66602), (5000, 66602))

##### 5. 학습 및 평가

In [23]:
from sklearn.linear_model import LogisticRegression 
# Logistic-regression classifier with a fixed seed and at most 500 solver iterations.
lrc = LogisticRegression(random_state=2023, max_iter=500)

In [24]:
# Training takes a while, so measure it with the %time line magic.
%time lrc.fit(X_train_cv, y_train)

CPU times: total: 5.09 s
Wall time: 5.42 s


In [25]:
# Mean accuracy of the unigram model on the held-out test split.
lrc.score(X_test_cv, y_test)

0.8786

##### 6. Bigram

In [None]:
# n-gram은 텍스트에서 연속된 단어의 시퀀스를 나타내는 방법
# ngram_range=(1,1)은 단일 단어(uni-gram)만 고려하고, (1,2)는 연속된 두 단어의 시퀀스(bi-gram)까지 고려한다는 의미
# 여기서는 ngram_range 매개변수를 (1,2)로 설정하여 유니그램(unigram, 단일 단어)과 바이그램(bigram, 연속된 두 단어의 쌍)을 모두 고려
# 즉, 단일 단어와 연속된 두 단어의 시퀀스까지 고려

In [26]:
# Same encoding steps as the unigram model, but the vocabulary also includes
# sequences of two consecutive words (bigrams).
cvect2 = CountVectorizer(ngram_range=(1, 2), stop_words='english')
X_train_cv2 = cvect2.fit(X_train).transform(X_train)
X_test_cv2 = cvect2.transform(X_test)
X_train_cv2.shape, X_test_cv2.shape

((20000, 1455899), (5000, 1455899))

In [27]:
# max_iter=500 is the maximum number of iterations the solver may take to converge.
lrc2 = LogisticRegression(random_state=2023, max_iter=500)
# Time the fit on the unigram+bigram features.
%time lrc2.fit(X_train_cv2, y_train)

CPU times: total: 57.1 s
Wall time: 49.8 s


In [29]:
# Mean accuracy of the unigram+bigram model on the held-out test split.
lrc2.score(X_test_cv2, y_test)

0.8896

##### 7. 모델 save/load

In [30]:
import joblib 

In [31]:
# Save the ngram_range=(1, 2) vectorizer and the classifier trained on its features.
import os

# joblib.dump does not create missing parent directories, so make sure 'model/' exists;
# otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs('model', exist_ok=True)
joblib.dump(cvect2, 'model/imdb_cvect_2.pkl')
joblib.dump(lrc2, 'model/imdb_lrc2.pkl')

['model/imdb_lrc2.pkl']

In [32]:
# Load the saved vectorizer and classifier back from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
new_cvect = joblib.load('model/imdb_cvect_2.pkl')
new_lrc = joblib.load('model/imdb_lrc2.pkl')

##### 8. 실제 데이터로 검증

In [72]:
# Two sample review texts used to try out the reloaded vectorizer and classifier.
review = ['''I stayed away from this film for a long time, doing a dumb thing: listening to the well-known film critics.
When I finally got around to it, I was very surprised. 
It was a good film. Not great, not intense as the first two Godfather flicks, but definitely a lot better than advertised.''', '''Having just watched 
The Godfather and Godfather II again after watching and enjoying it so many times, 
I decided to watch this 3rd and last installment again to see if it was really as bad as I remembered. It is. 
The actors are so stiff, so ridiculous, and the script is so idiotic it's almost like a parody of the first two movies. 
If you are a fan of the original Godfather and Godfather II do yourself a favor and don't spoil your memory of those two great movies 
by watching this trash.'''
]

In [73]:
# Text preprocessing: clean each review the same way the training data was cleaned --
# '<br />' tags become a space first, then every non-letter character becomes a space.
import re

# Build a list rather than a lazy map object: a map can be consumed only once, so
# re-running the feature-transform cell would otherwise see no reviews at all.
review = [re.sub('[^A-Za-z]', ' ', text.replace('<br />', ' ')) for text in review]

In [74]:
# Turn the cleaned reviews into features with the reloaded vectorizer.
# `review` is already an iterable of documents, so it is passed as-is, not wrapped in a list.
review_cv = new_cvect.transform(review)
review_cv.shape

(2, 1455899)

In [75]:
# Predict a label for every sample review.
new_lrc.predict(review_cv)

array([1, 0], dtype=int64)

In [76]:
# Prediction for the first review only: 1 = positive, 0 = negative.
new_lrc.predict(review_cv)[0]

1

In [77]:
# Map the first prediction to a Korean label: '긍정' (positive) for 1, otherwise '부정' (negative).
'긍정' if new_lrc.predict(review_cv)[0] == 1 else '부정'

'긍정'

In [80]:
# Single-review case: clean, encode and classify one text.
rev = '''Having just watched 
The Godfather and Godfather II again after watching and enjoying it so many times, 
I decided to watch this 3rd and last installment again to see if it was really as bad as I remembered. It is. 
The actors are so stiff, so ridiculous, and the script is so idiotic it's almost like a parody of the first two movies. 
If you are a fan of the original Godfather and Godfather II do yourself a favor and don't spoil your memory of those two great movies 
by watching this trash.'''
# Clean the text exactly as the training data was cleaned: '<br />' tags first, then every
# non-letter character becomes a space. Without the tag step, a '<br />' in the input would
# turn into a stray 'br' token, unlike in training.
rev = re.sub('[^A-Za-z]', ' ', rev.replace('<br />', ' '))
# transform expects an iterable of documents, hence the one-element list.
rev_cv = new_cvect.transform([rev])
# '긍정' (positive) when the predicted label is 1, otherwise '부정' (negative).
'긍정' if new_lrc.predict(rev_cv)[0] == 1 else '부정'

'부정'