In [None]:
import pandas as pd
import numpy as np
import math
import warnings
# Silence FutureWarning messages for the rest of the notebook session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Seed passed as random_state to the train/test split and to XGBClassifier below.
random_seed = 2020

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Text Sentiment Classification
- inputs: 영화 리뷰 텍스트
- output: 긍정(1) or 부정(0)

## Load dataset

- 데이터셋 불러오기

In [None]:
# Load the IMDB review dataset (tab-separated, zip-compressed) from Drive.
# Rows with missing values are dropped *before* the index is rebuilt, so the
# index stays contiguous (0..n-1) even when rows are actually removed.
movie_review = (
    pd.read_csv(
        '/content/drive/MyDrive/skku/2021-1 Data Mining/dm-machine-learning/imdb_dataset.txt',
        delimiter='\t',
        compression='zip',
    )
    .dropna()
    .reset_index(drop=True)
)
movie_review.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [None]:
movie_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
pd.DataFrame(movie_review['sentiment'].value_counts(normalize=True))

Unnamed: 0,sentiment
positive,0.5
negative,0.5


## Split dataset

In [None]:
# Raw review texts and their sentiment labels as plain Python lists.
data = movie_review['review'].tolist()
target = movie_review['sentiment'].tolist()

In [None]:
print(data[0],'\n', target[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

## Data preprocessing

- 전처리

In [None]:
! pip install contractions
import re
import contractions

In [None]:
def text_clean(text):
    """Normalize one raw review into lowercase, space-separated words.

    Steps, in order: expand English contractions, then replace ``<br />``
    tags, URLs, digit runs, the listed punctuation/symbol characters and
    apostrophes with a space, collapse whitespace runs, lowercase, and trim.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings so regex escapes such as \S or \( are not
    # parsed as (invalid) Python string escapes.
    text = contractions.fix(text)  # he's -> he is
    text = re.sub(r'<br />', ' ', text)  # HTML line breaks
    text = re.sub(r'https?:/\/\S+', ' ', text)  # remove urls
    text = re.sub(r'[0-9]+', ' ', text)  # remove numbers
    # Remove the listed symbols and punctuation, including . , ! and ?.
    # Characters outside this class (for example ; and _) pass through unchanged.
    text = re.sub(r'[-=+,#/\?:^$.@*"※~&%ㆍ!』\‘|\(\)\[\]\<\>`\…》]+', ' ', text)
    text = re.sub(r"'", ' ', text)  # remaining apostrophes
    text = re.sub(r'\s+', ' ', text)  # collapse repeated whitespace

    text = text.lower()
    return text.strip()

In [None]:
text_clean(movie_review.review.iloc[0])

'one of the other reviewers has mentioned that after watching just oz episode you will be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda them city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows would not dare forget pre

In [None]:
import nltk
# NOTE(review): 'all' requests every NLTK data package, and none of the names
# imported below are used in the cells visible here — confirm whether this
# download is needed at all.
nltk.download('all')
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
import spacy
import en_core_web_sm
from spacy.lang.en import English
from tqdm import tqdm

# https://spacy.io/usage

In [None]:
# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) #en_core_web_sm : version(small)
# spaCy's English stop-word collection; spacy_lemma drops lemmas found in it.
spacy_stopwords=spacy.lang.en.stop_words.STOP_WORDS

In [None]:
temp = nlp(movie_review.review.iloc[1])
temp

A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done.

In [None]:
def spacy_lemma(sentence):
    """Clean a review and return its lemmas, minus stop words, as one string.

    The text is normalized with ``text_clean`` and then run through the spaCy
    pipeline ``nlp``. A token is dropped when its lemma is the pronoun marker
    ``'-PRON-'`` or is contained in ``spacy_stopwords``.

    Args:
        sentence: Raw review text.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Build the exclusion set once per call so each membership test is a hash
    # lookup instead of a scan over a list rebuilt for every token.
    excluded = {'-PRON-', *spacy_stopwords}
    doc = nlp(text_clean(sentence))
    return " ".join(token.lemma_ for token in doc if token.lemma_ not in excluded)

- 전처리한 내용 body에 저장

In [None]:
# Clean and lemmatize every review; the result is stored in a new 'body' column.
movie_review['body'] = movie_review['review'].apply(spacy_lemma)

In [None]:
movie_review['body']

0        reviewer mention watch oz episode hook right e...
1        wonderful little production filming technique ...
2        think wonderful way spend time hot summer week...
3        basically family little boy jake think zombie ...
4        petter mattei s love time money visually stunn...
                               ...                        
49995    think movie right good job creative original e...
49996    bad plot bad dialogue bad act idiotic direct a...
49997    catholic teach parochial elementary school nun...
49998    disagree previous comment maltin second rate e...
49999    expect star trek movie high art fan expect mov...
Name: body, Length: 50000, dtype: object

In [None]:
# Cache the preprocessed frame next to the raw dataset on Drive, which is the
# location the reload cell reads from. index=False keeps the row index out of
# the file so reloading does not add an extra 'Unnamed: 0' column.
movie_review.to_csv(
    '/content/drive/MyDrive/skku/2021-1 Data Mining/dm-machine-learning/movie_review.csv',
    index=False,
)

- 전처리 과정이 지나치게 오래 걸리기 때문에, 전처리 결과를 저장해 둔 파일을 다시 불러와서 사용함

In [None]:
# Reload the cached, preprocessed reviews from Drive so the slow lemmatization
# step does not have to be repeated.
# NOTE(review): the displayed frame carries 'Unnamed: 0.1' / 'Unnamed: 0'
# columns — these look like row indexes written by earlier to_csv calls;
# confirm and drop them if unused.
movie_review = pd.read_csv('/content/drive/MyDrive/skku/2021-1 Data Mining/dm-machine-learning/movie_review.csv',encoding='utf-8')
movie_review.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,body
0,0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch oz episode hook right e...
1,1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love time money visually stunn...


In [None]:
# Preprocessed review bodies (features) and sentiment labels as Python lists.
data = movie_review['body'].tolist()
target = movie_review['sentiment'].tolist()

In [None]:
print(data[0],'\n', target[0])

reviewer mention watch oz episode hook right exactly happen thing strike oz brutality unflinche scene violence set right word trust faint hearted timid pull punch regard drug sex violence hardcore classic use word oz nickname oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass face inward privacy high agenda city home aryan muslims gangstas latinos christians italian irish scuffle death stare dodgy dealing shady agreement far away main appeal fact dare forget pretty picture paint mainstream audience forget charm forget romance oz mess episode strike nasty surreal ready watch develop taste oz accustomed high level graphic violence violence injustice crooked guard sell nickel inmate kill order away mannere middle class inmate turn prison bitch lack street skill prison experience watch oz comfortable uncomfortable view touch dark 
 positive


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; random_seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=random_seed
)

In [None]:
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))

35000 15000
35000 15000


- tokenize

In [None]:
# Gensim
import gensim
import gensim.corpora as corpora

In [None]:
# Whitespace-tokenize every preprocessed review in the frame.
new_token = [body.split() for body in movie_review['body']]
len(new_token)

50000

In [None]:
new_token[0]

['reviewer',
 'mention',
 'watch',
 'oz',
 'episode',
 'hook',
 'right',
 'exactly',
 'happen',
 'thing',
 'strike',
 'oz',
 'brutality',
 'unflinche',
 'scene',
 'violence',
 'set',
 'right',
 'word',
 'trust',
 'faint',
 'hearted',
 'timid',
 'pull',
 'punch',
 'regard',
 'drug',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'oz',
 'nickname',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focus',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cell',
 'glass',
 'face',
 'inward',
 'privacy',
 'high',
 'agenda',
 'city',
 'home',
 'aryan',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italian',
 'irish',
 'scuffle',
 'death',
 'stare',
 'dodgy',
 'dealing',
 'shady',
 'agreement',
 'far',
 'away',
 'main',
 'appeal',
 'fact',
 'dare',
 'forget',
 'pretty',
 'picture',
 'paint',
 'mainstream',
 'audience',
 'forget',
 'charm',
 'forget',
 'romance',
 'oz',
 'mess',
 'episode',
 'strike',
 'nasty',
 'surreal',
 'ready

In [None]:
# Assign an integer id to each distinct token, then encode every review as a
# bag-of-words list of (token_id, frequency) pairs.
dictionary = corpora.Dictionary(new_token)
corpus = list(map(dictionary.doc2bow, new_token))

In [None]:
corpus[0] # id, frequency

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 2),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 3),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 2),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 6),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 3),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 2),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 2),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),

In [None]:
len(dictionary)

87132

In [None]:
print(dictionary[66])

order


In [None]:
def my_preprocessor(text):
    """Thin wrapper that returns ``spacy_lemma(text)``."""
    return spacy_lemma(text)

In [None]:
def my_tokenizer(text):
    """Split ``text`` on whitespace; passed as ``tokenizer`` to the vectorizers below."""
    return text.split()

## Word Vectorizer

## CountVectorizer, TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training reviews
# only, and the same call encodes them.
cnt_vect = CountVectorizer(
    lowercase=False,
    tokenizer=my_tokenizer,
    min_df=5,
    max_features=1600,
)
X_train_cnt_vect = cnt_vect.fit_transform(X_train)

# Encode the test reviews with the vectorizer fitted on the training data.
X_test_cnt_vect = cnt_vect.transform(X_test)



In [None]:
print(X_train_cnt_vect.shape)

(35000, 1600)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: vocabulary and IDF weights are learned from the training
# reviews only, and the same call encodes them.
tfidf_vect = TfidfVectorizer(
    lowercase=False,
    tokenizer=my_tokenizer,
    min_df=5,
    max_features=1600,
)
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)

# Encode the test reviews with the vectorizer fitted on the training data.
X_test_tfidf_vect = tfidf_vect.transform(X_test)



- 특성값이 높은 단어 확인

In [None]:
# Largest count each vocabulary term reaches in any single training review.
max_value = X_train_cnt_vect.max(axis=0).toarray().ravel()
sorted_value = max_value.argsort()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
_get_names = getattr(cnt_vect, 'get_feature_names_out', None) or cnt_vect.get_feature_names
feature_names = np.asarray(_get_names(), dtype=str)
# The 100 terms with the highest per-review maximum, in ascending order.
feature_names[sorted_value[-100:]]

array(['good', 'horror', 'demon', 'hotel', 'brian', 'identity', 'bear',
       'david', 'rape', 'radio', 'follow', 'vs', 'dr', 'rock',
       'character', 'role', 'book', 'pretty', 'pilot', 'julie', 'bad',
       'water', 'high', 'michael', 'machine', 'computer', 'wave', 'boss',
       'davis', 'woman', 'bed', 'mask', 'story', 'ray', 'gay', 'buddy',
       ';', 'flick', 'life', 'stewart', 'novel', 'song', 'football',
       'robert', 'danny', 'war', 'monkey', 'post', 'chaplin', 'ben',
       '\x96', 'segment', 'ann', 'soldier', 'jackson', 'force', 'season',
       'love', 'kelly', 'harry', 'dream', 'game', 'mary', 'time',
       'series', 'batman', 'beach', 'james', 'howard', 'master', 'joke',
       'joe', 'child', 'stop', 'mad', 'steve', 'heart', 'great',
       'charlie', 'f', 'joan', 'jane', 'jeff', 'x', 'jason', 'bond',
       'snake', 'rating', 'car', 'guy', 'sam', 'zombie', 'movie', 'match',
       'like', 'rob', 'tony', 'film', '_', 's'], dtype='<U14')

In [None]:
# Largest TF-IDF weight each vocabulary term reaches in any single training review.
max_value = X_train_tfidf_vect.max(axis=0).toarray().ravel()
sorted_value = max_value.argsort()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back on older releases.
_get_names = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
feature_names = np.asarray(_get_names(), dtype=str)
# The 100 terms with the highest per-review maximum, in ascending order.
feature_names[sorted_value[-100:]]

array(['channel', 'camp', 'mystery', 'allen', 'vampire', 'bed', 'andy',
       'eddie', ';', 'nightmare', 'hand', 'teach', 'shakespeare',
       'taylor', 'gang', 'vs', 'terrible', 'annoy', 'horse', 'arthur',
       'rape', 'disney', 'jane', 'wow', 'view', 'season', 'joe', 'bad',
       'rob', 'identity', 'double', 'henry', 'computer', 'hot', 'grace',
       'werewolf', 'talk', 'sex', 'monkey', 'football', 'al', 'tired',
       'smith', 'chaplin', 'study', 'ben', 'gary', 'band', 'beach',
       'shoot', 'stewart', 'bob', 'kelly', 'christopher', 'australian',
       'opera', 'jesus', 'episode', 'pack', 'page', 'match', 'stupid',
       'suck', 'jason', 'bond', 'sam', 'awful', 'd', 'version', 'college',
       'p', 'german', 'mary', 'ann', 'cartoon', 'check', 'program', 'f',
       'ray', 'beautifully', 'batman', 'mike', 'jeff', 'master', 'buddy',
       'jerry', 'zombie', '\x96', 'little', 'snake', 'bruce', 'cat',
       'joan', 'game', 'rating', 'joke', 'horrible', 'steve', 'demon',
  

## Model Selection

## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
def evaluation_report(y_test, pred, is_return=True):
    """Print accuracy, precision, recall, F1 and ROC AUC on a single line.

    Args:
        y_test: True labels.
        pred: Predicted labels, aligned with ``y_test``.
        is_return: When true (the default) the computed scores are also
            returned; when false the function only prints.

    Returns:
        A dict mapping each printed metric name to its value when
        ``is_return`` is true, otherwise ``None``.

    Note:
        ``roc_auc_score`` receives the hard label predictions passed in as
        ``pred``, not probabilities or decision scores.
    """
    scores = {
        'accuracy_score': accuracy_score(y_test, pred),
        'precision_score': precision_score(y_test, pred),
        'recall_score': recall_score(y_test, pred),
        'f1_score': f1_score(y_test, pred),
        'roc_auc_score': roc_auc_score(y_test, pred),
    }
    for name, value in scores.items():
        print('{name} = {value:.2f}'.format(name=name, value=value), end='\t')
    if is_return:
        return scores
    return None

# Example 1: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression trained on the bag-of-words counts.
lr_clf = LogisticRegression(max_iter=200, n_jobs=-1).fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)

In [None]:
# Encode labels as 1 (positive) / 0 (negative) for the binary metrics.
# Values that are already encoded as 1 are accepted too, so re-running this
# cell after y_test / pred have been rebound to integers does not silently
# turn every label into 0.
_POSITIVE = {'positive', '1'}
test = [1 if str(y) in _POSITIVE else 0 for y in y_test]
ptem = [1 if str(p) in _POSITIVE else 0 for p in pred]

In [None]:
# Rebind y_test / pred to their 0/1 encodings; the later evaluation cells pass
# this integer y_test together with integer-encoded predictions.
y_test = test
pred = ptem

In [None]:
evaluation_report(test, ptem)

accuracy_score = 0.87	precision_score = 0.87	recall_score = 0.87	f1_score = 0.87	roc_auc_score = 0.87	

In [None]:
print(classification_report(test, ptem))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      7474
           1       0.87      0.87      0.87      7526

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



In [None]:
# Same baseline, trained on the TF-IDF features instead of the raw counts.
lr_clf = LogisticRegression(max_iter=200, n_jobs=-1).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)

In [None]:
# Convert the predicted string labels to 1 (positive) / 0 (negative).
pred = [1 if p=='positive' else 0 for p in pred]

In [None]:
evaluation_report(y_test, pred)

accuracy_score = 0.87	precision_score = 0.87	recall_score = 0.88	f1_score = 0.87	roc_auc_score = 0.87	

In [None]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      7474
           1       0.87      0.88      0.87      7526

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



# Example 2: Model Selection

In [None]:
def model_selection(X_train, y_train, kfold=3, models=None):
    """Compare classifiers by k-fold cross-validated accuracy.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        kfold: Number of cross-validation folds.
        models: Optional iterable of unfitted estimators to compare. When
            omitted, LogisticRegression, XGBClassifier and LinearSVC are used.

    Returns:
        A DataFrame with one row per (model, fold) and the columns
        ``model_name``, ``fold_idx`` and ``accuracy``.
    """
    from sklearn.model_selection import cross_val_score
    from tqdm.notebook import tqdm

    if models is None:
        # Default candidates are imported lazily so callers that pass their
        # own models do not need xgboost installed.
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import LinearSVC
        from xgboost import XGBClassifier

        models = [
            LogisticRegression(),
            XGBClassifier(random_state=random_seed),
            # Seeded like XGBClassifier so repeated runs are comparable.
            LinearSVC(random_state=random_seed),
        ]
    else:
        models = list(models)

    progress_bar = tqdm(total=len(models) * kfold)
    entries = []
    try:
        for model in models:
            model_name = model.__class__.__name__
            accuracies = cross_val_score(model, X_train, y_train,
                                         scoring='accuracy', cv=kfold, n_jobs=-1)
            # All fold scores of a model arrive together, so the bar advances
            # once per fold after that model's cross-validation has finished.
            for fold_idx, accuracy in enumerate(accuracies):
                progress_bar.update()
                entries.append((model_name, fold_idx, accuracy))
    finally:
        # Close the bar even when a model fails to fit.
        progress_bar.close()

    return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
model_selection_result = model_selection(X_train_tfidf_vect, y_train)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




In [None]:
model_selection_result

Unnamed: 0,model_name,fold_idx,accuracy
0,LogisticRegression,0,0.866804
1,LogisticRegression,1,0.86209
2,LogisticRegression,2,0.862164
3,XGBClassifier,0,0.795492
4,XGBClassifier,1,0.797034
5,XGBClassifier,2,0.800789
6,LinearSVC,0,0.858318
7,LinearSVC,1,0.85609
8,LinearSVC,2,0.85642


In [None]:
# Mean cross-validated accuracy per model. The aggregation is given by name
# because recent pandas releases deprecate passing the np.mean callable here.
pd.pivot_table(model_selection_result, values='accuracy', index=['model_name'], aggfunc='mean', fill_value=0)

Unnamed: 0_level_0,accuracy
model_name,Unnamed: 1_level_1
LinearSVC,0.856943
LogisticRegression,0.863686
XGBClassifier,0.797772


# Example 3: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=200)
C_values = [0.001, 0.01, 0.1, 1, 5, 10]
# The default 'lbfgs' solver supports only the L2 penalty, so the L1
# candidates are paired with 'liblinear', which supports L1. Without this,
# every penalty='l1' fit fails and is scored as NaN instead of being compared.
params = [
    {'penalty': ['l2'], 'C': C_values},
    {'penalty': ['l1'], 'solver': ['liblinear'], 'C': C_values},
]

grid_lr_clf = GridSearchCV(lr_clf,
                           param_grid=params, verbose=1, scoring='accuracy', n_jobs=-1, cv=3)

# Search over the TF-IDF training features using 3-fold CV accuracy.
grid_lr_clf.fit(X_train_tfidf_vect, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    7.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=200, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 5, 10],
                         'penalty': ['l2', 'l1']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

- 최적 파라미터

In [None]:
grid_lr_clf.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

- 최적의 파라미터를 사용한 테스트셋 성능 확인

In [None]:
# The grid search was fitted on TF-IDF features, so the test set must be the
# TF-IDF matrix as well (not the count-vector matrix).
pred = grid_lr_clf.predict(X_test_tfidf_vect)

In [None]:
# Convert the predicted string labels to 1 (positive) / 0 (negative).
pred = [1 if p=='positive' else 0 for p in pred]

In [None]:
evaluation_report(y_test, pred)

accuracy_score = 0.86	precision_score = 0.84	recall_score = 0.89	f1_score = 0.87	roc_auc_score = 0.86	

In [None]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.83      0.86      7474
           1       0.84      0.89      0.87      7526

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000

