In [1]:
# Data Manipulating and Visualization
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random


# Operating System
import os
from datetime import datetime

# Machine Learning Algorithms
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Performance metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score

# Hyperparameter
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

## Mathematics and Statistics
import scipy.stats as stats
from scipy.stats import uniform
from scipy.stats import loguniform

# NLP related 
import string
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

### [Introduction to the IMDB Dataset](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [2]:
review_df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# 1. Data Exploration Analysis and Data Treatment
## 1.1 Diuplicated reviews

In [3]:
review_df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
review_df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

**As we can see that are 49,582 unique reviews. That means there are duplicated reviews.**

**Some of the duplicated reviews are printed below.**

In [5]:
duplicates = review_df['review'].duplicated(keep=False)
review_df.loc[duplicates,:].sort_values('review').head(10)

Unnamed: 0,review,sentiment
34058,"""Go Fish"" garnered Rose Troche rightly or wron...",negative
47467,"""Go Fish"" garnered Rose Troche rightly or wron...",negative
29956,"""Three"" is a seriously dumb shipwreck movie. M...",negative
31488,"""Three"" is a seriously dumb shipwreck movie. M...",negative
47527,"""Witchery"" might just be the most incoherent a...",negative
2976,"""Witchery"" might just be the most incoherent a...",negative
7949,'Dead Letter Office' is a low-budget film abou...,negative
32260,'Dead Letter Office' is a low-budget film abou...,negative
18022,(Spoilers)<br /><br />Oh sure it's based on Mo...,negative
22449,(Spoilers)<br /><br />Oh sure it's based on Mo...,negative


**Drop the duplicated rows**
- The [pandas.DataFrame.drop_duplicates](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html) function is applied. 

In [6]:
review_df_dd = review_df.drop_duplicates(ignore_index=True)
review_df_dd.describe()

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,24884


## 1.2 Data Treatment

In [7]:
review_df_dd.loc[11,'review']

"I saw this movie when I was about 12 when it came out. I recall the scariest scene was the big bird eating men dangling helplessly from parachutes right out of the air. The horror. The horror.<br /><br />As a young kid going to these cheesy B films on Saturday afternoons, I still was tired of the formula for these monster type movies that usually included the hero, a beautiful woman who might be the daughter of a professor and a happy resolution when the monster died in the end. I didn't care much for the romantic angle as a 12 year old and the predictable plots. I love them now for the unintentional humor.<br /><br />But, about a year or so later, I saw Psycho when it came out and I loved that the star, Janet Leigh, was bumped off early in the film. I sat up and took notice at that point. Since screenwriters are making up the story, make it up to be as scary as possible and not from a well-worn formula. There are no rules."

**Take a look at one sample review. There are several things we need to do:**
1. Remove the "br/" tags
2. Remove stop words
3. Remove punctuations
4. Text Stemming 

In [8]:
##### Returns the lowercase string from the given string
def lower_string(string):
    return string.lower()

##### Remove special terms
def remove_special_strips(text,term):
    return text.replace(term, "")

##### Remove punctuations and stop words
# 1: Tokenize the text.
# 2: Remove tokens that are stop words or punctuations
# (note that punctuations that are not in string.punctuation will be kept.)
# (note that removing punctuations in numbers might change the meaning, e.g.,, "1.5" becomes "15", 100% becomes 100).
# 3: If a sentence ends with a period, the last word and the period are combined into a single token. Remove such periods.
def text_treatment(text):
    text = lower_string(text)
    
    text = remove_special_strips(text,'<br />')
    
    stop_punc = set(stopwords.words('english') + list(string.punctuation))
    tokenizer=ToktokTokenizer()
    filtered_tokens = [i for i in tokenizer.tokenize(text) if i not in stop_punc]
    filtered_tokens = [i.replace('.', "") for i in filtered_tokens]
    filtered_text = ' '.join(filtered_tokens) 
    return(filtered_text)

##### Text stemming
# See a comparison between two stemmers from NLTK
# https://www.nltk.org/howto/stem.html
def stemmer(text):
    sinlge_stemmer = PorterStemmer()
    words = text.split()
    words_stem = [sinlge_stemmer.stem(i) for i in words]
    text_stem  = ' '.join(words_stem)
    return text_stem

[Text Stemmers from NLTK](https://www.nltk.org/howto/stem.html)

**Remark:** 
- **An alternative to stemming is lemmatizing (not used for this study). Lemmatization produces a linguistically valid word while stemming is faster but may generate non-words. Meanwhile, lemmatization is computationally expensive since it involves look-up tables and what not.**
- **In this study, PorterStemmer is used. The codes and results below provide a comparison between two stemmers from NLTK. [Also see the NLTK reference here](https://www.nltk.org/howto/stem.html).**

In [9]:
words = ['caresses', 'flies', 'denied', 'agreed','meeting', 'stating', 'sensational', 
           'reference', 'colonizer','plotted','running','generously','happily','successfully']

compare_stemmers = pd.DataFrame(columns=['Original','Porter','Snowball'],index=range(len(words)))
compare_stemmers['Original'] = words

sinlge_stemmer = PorterStemmer()
singles = [sinlge_stemmer.stem(i) for i in words]

snowball_stemmer = SnowballStemmer("english")
snowballs = [snowball_stemmer.stem(i) for i in words]

compare_stemmers['Porter']   = singles
compare_stemmers['Snowball'] = snowballs

compare_stemmers

Unnamed: 0,Original,Porter,Snowball
0,caresses,caress,caress
1,flies,fli,fli
2,denied,deni,deni
3,agreed,agre,agre
4,meeting,meet,meet
5,stating,state,state
6,sensational,sensat,sensat
7,reference,refer,refer
8,colonizer,colon,colon
9,plotted,plot,plot


**Compare the texts before and after these treatments. A review randomly selected from the first 100 rows.**

In [10]:
a = np.random.randint(100)
print('Review '+str(a))
text = review_df_dd.loc[a,'review']
print('------original text------')
print(text)
print()

print('------after removing punctuations and stop words------')
text = text_treatment(text)
print(text)
print()

print('------after stemming------')
text= stemmer(text)
print(text)

Review 79
------original text------
This film took me by surprise. I make it a habit of finding out as little as possible about films before attending because trailers and reviews provide spoiler after spoiler. All I knew upon entering the theater is that it was a documentary about a long married couple and that IMDb readers gave it a 7.8, Rotten Tomatoes users ranked it at 7.9 and the critics averaged an amazing 8.2! If anything, they UNDERRATED this little gem.<br /><br />Filmmaker Doug Block decided to record his parents "for posterity" and at the beginning of the film we are treated to the requisite interviews with his parents, outspoken mother Mina, and less than forthcoming dad, Mike. I immediately found this couple interesting and had no idea where the filmmaker (Mike & Mina's son Doug) was going to take us. As a matter of fact, I doubt that Doug himself knew where he was going with this!<br /><br />Life takes unexpected twists and turns and this beautifully expressive film foll

**Apply the function to the entire review dataframe, and save the data**

In [11]:
review_df_model = review_df_dd.copy()
review_df_model['review']=review_df_model['review'].apply(text_treatment)
review_df_model['review']=review_df_model['review'].apply(stemmer)

# 2 Creating TFIDF features
- An alternative is **Bag of Words** (BoW) simply counts the frequency of words in a document. Thus the vector for a document has the frequency of each word in the corpus for that document. The key difference between bag of words and TF-IDF is that the former does not incorporate any sort of inverse document frequency (IDF)  and is only a frequency count (TF).
- The [sklearn.feature_extraction.text.TfidfVectorizer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) function is used for creating the TF-IDF features.
- See more about TFIDF in [this notebook](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/IMDB_Reviews/tfidf.ipynb)

#### Number of reviews in the data: 49,582

In [12]:
corpus= list(review_df_model['review'])
len(corpus)

49582

#### The total number of words contained in all reviews' vocabulary：130,312

In [13]:
##### tfidf_0 is a TfidfVectorizer with no constraints
tfidf_0 = TfidfVectorizer()
##### fit the TfidfVectorizer using corpus
result  = tfidf_0.fit_transform(corpus)
##### number of words(tokens) in the vocabulary
vocabulary = tfidf_0.vocabulary_
print("There are %d words in the corpus." %len(vocabulary))

There are 130312 words in the corpus.


### Parameters for creating TFIDF features
- Without these parameters, all the 130,312 words in the vocabulary will be considered, which is not optimal.
- **max_features**: If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. Otherwise, all features are used.
- **min_df**: Ignore terms that have a document frequency strictly lower than the given threshold. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. 

In [14]:
def create_tfidf_df(max_feature,min_df,corpus):
    Vectorizer = TfidfVectorizer(max_features=max_feature,min_df=min_df)
    result     = Vectorizer.fit_transform(corpus)
    tfidf_df   = pd.DataFrame(result.toarray(),columns=Vectorizer.get_feature_names_out())
    return(tfidf_df)

### Define 12 **TfidfVectorizer()** using different combinations of the two parameters
* #### *max_features*: 10000, 5000, 2000 
* #### *min_df*: 0(i.e., no limit), 0,01, 0.02, 0.1

In [15]:
tfidf_df_1  = create_tfidf_df(10000,0,corpus)
tfidf_df_2  = create_tfidf_df(10000,0.01,corpus)
tfidf_df_3  = create_tfidf_df(10000,0.02,corpus)
tfidf_df_4  = create_tfidf_df(10000,0.1,corpus)
tfidf_df_5  = create_tfidf_df(5000,0,corpus)
tfidf_df_6  = create_tfidf_df(5000,0.01,corpus)
tfidf_df_7  = create_tfidf_df(5000,0.02,corpus)
tfidf_df_8  = create_tfidf_df(5000,0.1,corpus)
tfidf_df_9  = create_tfidf_df(2000,0,corpus)
tfidf_df_10 = create_tfidf_df(2000,0.01,corpus)
tfidf_df_11 = create_tfidf_df(2000,0.02,corpus)
tfidf_df_12 = create_tfidf_df(2000,0.1,corpus)

#### The numbers of TFIDF features using different combinations of max_features and min_df are shown below. The results imply that if we fix the min_df, it does not matter whether we consider the top 10000, 5000, or 2000 features.

#### Going forward, we will use the top 2000 TFIDF features with no min_df constraints, and conduct thefeature selection from there. 

In [16]:
t = pd.DataFrame(columns = ['max_feature','min_df','N_of_columns'])
t.loc[1,:]  = [10000,'no limit',tfidf_df_1.shape[1]]             
t.loc[2,:]  = [10000,'0.01',    tfidf_df_2.shape[1]]            
t.loc[3,:]  = [10000,'0.02',    tfidf_df_3.shape[1]]            
t.loc[4,:]  = [10000,'0.1',     tfidf_df_4.shape[1]]            
t.loc[5,:]  = [5000,'no limit', tfidf_df_5.shape[1]]     
t.loc[6,:]  = [5000,'0.01',     tfidf_df_6.shape[1]]        
t.loc[7,:]  = [5000,'0.02',     tfidf_df_7.shape[1]]            
t.loc[8,:]  = [5000,'0.1',      tfidf_df_8.shape[1]]            
t.loc[9,:]  = [2000,'no limit', tfidf_df_9.shape[1]]            
t.loc[10,:] = [2000,'0.01',     tfidf_df_10.shape[1]]            
t.loc[11,:] = [2000,'0.02',     tfidf_df_11.shape[1]]            
t.loc[12,:] = [2000,'0.1',      tfidf_df_12.shape[1]]            
t.sort_values(by='N_of_columns', ascending=False)

Unnamed: 0,max_feature,min_df,N_of_columns
1,10000,no limit,10000
5,5000,no limit,5000
9,2000,no limit,2000
2,10000,0.01,1645
6,5000,0.01,1645
10,2000,0.01,1645
3,10000,0.02,933
7,5000,0.02,933
11,2000,0.02,933
4,10000,0.1,137


### Convert the label of 'sentiment' into a 0-1 variable: "positive" = 1, "negative" = 0.
### Note that in the TFIDF dataframe there is one column called "sentiment", and one column called "review", which are same with the first and the second columns in the original review_df data. Need to keep that in mind and avoid duplicated column names.

In [17]:
review_df_model = review_df_model.rename(columns={'review':'movie_review'}) 
review_df_model['sentiment_label']  = review_df_model['sentiment']

### Convert the label of 'sentiment' into a 0-1 variable: "positive" = 1, "negative" = 0.
review_df_model['sentiment_number'] = (review_df_model['sentiment'] == 'positive').astype(int)

### Drop the "sentiment" column. 
review_df_model.drop(['sentiment'],axis=1,inplace=True)

### Merge the TFIDF features
Modeling_Date = pd.concat([review_df_model,tfidf_df_9],axis=1)

### print 3 rows 
Modeling_Date.head(3)

Unnamed: 0,movie_review,sentiment_label,sentiment_number,000,10,100,11,12,13,15,...,yeah,year,yet,york,you,young,younger,youth,zero,zombi
0,one review mention watch 1 oz episod hook righ...,positive,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,wonder littl product film techniqu unassuming-...,positive,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,thought wonder way spend time hot summer weeke...,positive,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.075983,0.0,0.0,0.0,0.093514,0.0,0.0,0.0,0.0


In [18]:
Modeling_Date.to_csv('/kaggle/working/Modeling_Date.csv',index=False)

In [19]:
X_Data = Modeling_Date.drop(['movie_review','sentiment_label','sentiment_number'], axis=1)
Y_Data = Modeling_Date.sentiment_number
x1,x2,y1,y2 = train_test_split(X_Data,Y_Data,test_size = 0.3,random_state = 42)

params_LGB= {'boosting_type'    : 'gbdt',
             'objective'        : 'binary',
             'colsample_bytree' : 0.8,
             'learning_rate'    : 0.05,
             'min_child_samples': 10,
             'min_child_weight' : 5,
             'max_depth'        : -1,
             'min_split_gain'   : 0,
             'num_leaves'       : 31,
             'subsample_for_bin': 50000,
             'subsample_freq'   : 1
}

In [20]:
LGB_2000 = lgb.LGBMClassifier(**params_LGB,importance_type='gain')
LGB_2000.fit(X = x1, y = y1,
             eval_metric=['auc','logloss'], eval_set=[(x1,y1),(x2,y2)],
             callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
importance_df = pd.DataFrame(list(x1)).rename(columns={0:'Features'})
importance_df['importance'] = LGB_2000.feature_importances_
importance_df = importance_df.sort_values(by=['importance'],ascending=False)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.938437	training's binary_logloss: 0.35314	valid_1's auc: 0.912574	valid_1's binary_logloss: 0.39339


In [21]:
importance_df

Unnamed: 0,Features,importance
160,bad,25628.850548
1978,worst,20144.535370
1923,wast,17687.451521
797,great,10678.459848
151,aw,9401.900133
...,...,...
703,follow,0.000000
702,folk,0.000000
701,focus,0.000000
700,focu,0.000000
