In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the review dataset from CSV into a DataFrame, decoding the file as ISO-8859-1.
df = pd.read_csv('sample_data/IMDB_Dataset.csv', encoding='ISO-8859-1')

In [None]:
# Replace the "sentiment" labels with integer codes. pd.factorize numbers the
# distinct values in order of first appearance, so which label becomes 0
# depends on the first row of the file.
df["sentiment"]=pd.factorize(df["sentiment"])[0].astype(int)

In [None]:
# Keep the integer-encoded sentiment column as the target vector and display it.
y=df["sentiment"]
y

0        0
1        0
2        0
3        1
4        0
        ..
49995    0
49996    1
49997    1
49998    1
49999    1
Name: sentiment, Length: 50000, dtype: int64

In [None]:
# Preview the review text column before any cleaning is applied.
df["review"]

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [None]:
# Lower-case every review so its words can match the lower-case stop-word list defined below.
df['review']=df['review'].str.lower()

In [None]:
# Strip punctuation: delete every run of characters that is neither a word
# character nor whitespace. regex=True must be passed explicitly -- without it
# pandas >= 2.0 treats the pattern as a literal string and removes nothing
# (older versions emit a FutureWarning about the changing default).
df['review'] = df['review'].str.replace(r'[^\w\s]+', '', regex=True)

  df['review'] = df['review'].str.replace(r'[^\w\s]+', '')


In [None]:
!pip install nltk

import nltk

nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import RegexpTokenizer
# Split each review on runs of whitespace; gaps=True makes the pattern describe
# the separators rather than the tokens. The pattern is a raw string so that
# "\s" is not parsed as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
df['review'] = df['review'].apply(tokenizer.tokenize)
df['review'].tail()

49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [im, going, to, have, to, disagree, with, the,...
49999    [no, one, expects, the, star, trek, movies, to...
Name: review, dtype: object

In [None]:
# Hand-maintained list of English stop words, written without apostrophes
# (e.g. "youre", "shouldve"). Built by splitting a whitespace-separated
# block of text, which yields the words in the order they are written.
stopwordlist = """
    a about above after again ain all am an
    and any are as at be because been before
    being below between both by can d did do
    does doing down during each few for from
    further had has have having he her here
    hers herself him himself his how i if in
    into is it its itself just ll m ma
    me more most my myself now o of on once
    only or other our ours ourselves out own re s
    same she shes should shouldve so some such
    t than that thatll the their theirs them
    themselves then there these they this those
    through to too under until up ve very was
    we were what when where which while who whom
    why will with won y you youd youll youre
    youve your yours yourself yourselves
""".split()

In [None]:
# Set form of the stop-word list for constant-time membership tests.
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    """Remove stop words from a review and return the rest as one string.

    Parameters
    ----------
    text : str or list/tuple of str
        Either raw text (split on whitespace) or an already tokenized
        review. Token sequences are used as-is; converting a list with
        ``str()`` would yield pieces such as ``"['one',"`` that can never
        match a stop word.

    Returns
    -------
    str
        The words not found in ``STOPWORDS``, joined by single spaces.
    """
    if isinstance(text, str):
        tokens = text.split()
    elif isinstance(text, (list, tuple)):
        tokens = text
    else:
        # Fallback for any other value: split its string form, as before.
        tokens = str(text).split()
    return " ".join(str(word) for word in tokens if word not in STOPWORDS)
# Run stop-word removal over every review, then preview the first rows.
df['review'] = df['review'].apply(cleaning_stopwords)
df['review'].head()

0    ['one', 'of', 'the', 'other', 'reviewers', 'ha...
1    ['a', 'wonderful', 'little', 'production', 'br...
2    ['i', 'thought', 'this', 'was', 'a', 'wonderfu...
3    ['basically', 'theres', 'a', 'family', 'where'...
4    ['petter', 'matteis', 'love', 'in', 'the', 'ti...
Name: review, dtype: object

In [None]:
from nltk.stem.porter import PorterStemmer


# One shared Porter stemmer instance, reused for every review.
stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with the Porter stemmer.

    Returns the stemmed words joined back together with single spaces.
    """
    stemmed = map(stemmer.stem, text.split())
    return " ".join(stemmed)

# Stem every review, then preview the first rows of the frame.
df["review"] = df["review"].apply(stem_words)
df.head()

Unnamed: 0,review,sentiment
0,"['one', 'of', 'the', 'other', 'reviewers', 'ha...",0
1,"['a', 'wonderful', 'little', 'production', 'br...",0
2,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...",0
3,"['basically', 'theres', 'a', 'family', 'where'...",1
4,"['petter', 'matteis', 'love', 'in', 'the', 'ti...",0


In [None]:
import nltk
# Download NLTK's "wordnet" data package; a later cell uses WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
def cleaning_repeating_char(text):
    """Collapse every run of a repeated character down to a single character.

    For example "soooo" becomes "so". The pattern captures one character and
    uses a backreference to match its immediate repetitions; the replacement
    keeps just the captured character. Without the backslashes the pattern
    would instead match "any character followed by the digit 1".

    Note that legitimate double letters are collapsed as well
    ("good" becomes "god"). Newline characters are never collapsed because
    "." does not match them.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        ``text`` with each run of identical characters reduced to one.
    """
    return re.sub(r'(.)\1+', r'\1', text)
# Run the repeated-character cleaner over every review, then preview the last rows.
df['review'] = df['review'].apply(cleaning_repeating_char)
df['review'].tail()


49995    ['i', 'thought', 'this', 'movie', 'did', 'a', ...
49996    ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...
49997    ['i', 'am', 'a', 'catholic', 'taught', 'in', '...
49998    ['im', 'going', 'to', 'have', 'to', 'disagree'...
49999    ['no', 'one', 'expects', 'the', 'star', 'trek'...
Name: review, dtype: object

In [None]:
def cleaning_URLs(data):
    """Replace each URL in ``data`` with a single space.

    A URL is anything starting with "www." or "http://" / "https://" and
    running up to the next whitespace character. The character class must be
    "not whitespace" (backslash-s): with a plain "s" the match would run
    across spaces until the next letter "s". The dot after "www" is escaped
    so that it only matches a literal dot.

    NOTE(review): if punctuation has already been stripped from the text
    before this runs, URLs no longer contain "." or "://" and will not match.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` with every matched URL replaced by one space.
    """
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)
# Run the URL cleaner over every review, then preview the last rows.
df['review'] = df['review'].apply(cleaning_URLs)
df['review'].tail()

49995    ['i', 'thought', 'this', 'movie', 'did', 'a', ...
49996    ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...
49997    ['i', 'am', 'a', 'catholic', 'taught', 'in', '...
49998    ['im', 'going', 'to', 'have', 'to', 'disagree'...
49999    ['no', 'one', 'expects', 'the', 'star', 'trek'...
Name: review, dtype: object

In [None]:
def cleaning_numbers(data):
    """Return ``data`` with every run of ASCII digits (0-9) deleted."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', data)
# Delete digits from every review, then preview the last rows.
df['review'] = df['review'].apply(cleaning_numbers)
df['review'].tail()

49995    ['i', 'thought', 'this', 'movie', 'did', 'a', ...
49996    ['bad', 'plot', 'bad', 'dialogue', 'bad', 'act...
49997    ['i', 'am', 'a', 'catholic', 'taught', 'in', '...
49998    ['im', 'going', 'to', 'have', 'to', 'disagree'...
49999    ['no', 'one', 'expects', 'the', 'star', 'trek'...
Name: review, dtype: object

In [None]:
from nltk.stem import WordNetLemmatizer

# One shared WordNet lemmatizer instance, reused for every review.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed to ``lemmatizer.lemmatize`` and the results are
    joined back together with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatize every review, then preview the first rows of the frame.
df["review"] = df["review"].apply(lemmatize_words)
df.head()

Unnamed: 0,review,sentiment
0,"['one', 'of', 'the', 'other', 'reviewers', 'ha...",0
1,"['a', 'wonderful', 'little', 'production', 'br...",0
2,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...",0
3,"['basically', 'theres', 'a', 'family', 'where'...",1
4,"['petter', 'matteis', 'love', 'in', 'the', 'ti...",0


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews (and their labels) for testing; the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
df_train,df_test,y_train,y_test=train_test_split(df["review"],y,test_size=0.20,random_state=42)
# Report the size of each of the four pieces as a sanity check.
print('DF Train Shape: ',df_train.shape)
print('DF Test Shape: ',df_test.shape)
print('Y Train Shape: ',y_train.shape)
print('Y Test Shape: ',y_test.shape)

DF Train Shape:  (40000,)
DF Test Shape:  (10000,)
Y Train Shape:  (40000,)
Y Test Shape:  (10000,)


In [None]:
# Turn the cleaned reviews into TF-IDF feature matrices, keeping at most
# 5000 terms in the vocabulary.
#
# The vectorizer is fitted on the training split only. Fitting on the full
# corpus would let test-set documents influence the vocabulary and the IDF
# weights, leaking held-out information into the features.
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df_train)

Train_X_Tfidf = Tfidf_vect.transform(df_train)
Test_X_Tfidf = Tfidf_vect.transform(df_test)



In [None]:
# Hyper-parameter search space for the MLP grid search:
# 3 * 3 * 2 * 2 * 2 * 2 = 144 combinations in total.
# NOTE(review): per scikit-learn's MLPClassifier docs, learning_rate is only
# used by the 'sgd' solver, so it is redundant for the 'adam' combinations.
param_grid = dict(
    hidden_layer_sizes=[(150, 100, 50), (120, 80, 40), (100, 50, 30)],
    max_iter=[50, 100, 150],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Exhaustive 5-fold cross-validated search over param_grid, using all CPU cores.
# The base estimator is created here so the cell does not depend on an `mlp`
# variable defined in a later cell (which raises NameError on a fresh kernel).
# Everything except random_state is overridden by the grid, so only the seed
# is fixed.
grid = GridSearchCV(MLPClassifier(random_state=42), param_grid, n_jobs=-1, cv=5)
grid.fit(Train_X_Tfidf, y_train)

print(grid.best_params_)

# Score the refitted best model on the held-out test features.
grid_predictions = grid.predict(Test_X_Tfidf)

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, grid_predictions)))

In [None]:
# Train a network with one hidden layer of 100 units on the TF-IDF features.
# random_state is fixed so repeated runs start from the same initial weights.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
)
mlp.fit(Train_X_Tfidf, y_train)

In [None]:
# Mean accuracy of the fitted network on the held-out test features and labels.
accuracy = mlp.score(Test_X_Tfidf, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8707
