In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
from bs4 import BeautifulSoup #remove HTML tagging
from tqdm import tqdm #time monitoring

### Preprocessing

In [2]:
df= pd.read_csv("C:/Users/Samuele/Downloads/IMDBreviews.csv")
df = df[:10000] # we use only ten thousand for computational reasons

In [3]:
df.head() #notice the HTML tagging!

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
corpus = pd.Series(df.review.tolist()).astype(str)
i=0
for review in tqdm(corpus):
    nohtml = BeautifulSoup(corpus[i])
    corpus[i] = nohtml.get_text()
    i+=1
print(corpus.head())

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2758.25it/s]

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
dtype: object





In [5]:
corpus.head() #there are some html formatting terms like <br

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
dtype: object

With the following function we remove stopwords (useless words) and punctuation

In [6]:
def stopwords_deleter(corpus, print_opt = False):
    stop = set(stopwords.words('english'))
    words = []
    for word in corpus.split():
        if word.lower().translate(str.maketrans('','',string.punctuation)) not in stop:
            words.append(word.translate(str.maketrans('','',string.punctuation)))   
    nostopw_corpus = " ".join(words)
    if print_opt == True:
        print("Old length: ", len(corpus))
        print("New length: ", len(nostopw_corpus))
    return(nostopw_corpus)

In [7]:
stopwords_deleter(corpus[0], print_opt = True)  #this is what is happening with one text

Old length:  1725
New length:  1159


'One reviewers mentioned watching 1 Oz episode youll hooked right exactly happened meThe first thing struck Oz brutality unflinching scenes violence set right word GO Trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordIt called OZ nickname given Oswald Maximum Security State Penitentary focuses mainly Emerald City experimental section prison cells glass fronts face inwards privacy high agenda Em City home manyAryans Muslims gangstas Latinos Christians Italians Irish moreso scuffles death stares dodgy dealings shady agreements never far awayI would say main appeal show due fact goes shows wouldnt dare Forget pretty pictures painted mainstream audiences forget charm forget romanceOZ doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste Oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle 

In [8]:
preproc_reviews = pd.Series(dtype="str")
i = 0
for review in tqdm(corpus):
    preproc_reviews = preproc_reviews.append(pd.Series([stopwords_deleter(review)], index=[i]), ignore_index=True)
    i += 1

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:16<00:00, 597.90it/s]


In [9]:
preproc_reviews.head()

0    One reviewers mentioned watching 1 Oz episode ...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    Basically theres family little boy Jake thinks...
4    Petter Matteis Love Time Money visually stunni...
dtype: object

### Lemmatization

In [10]:
def Lemmatizer(corpus, print_opt=False):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for word in corpus.split():
        lemmatized.append(lemmatizer.lemmatize(word))
    lemmatized_corpus = " ".join(lemmatized)  # Join sentences with spaces
    if print_opt:
        print("Old length: ", len(corpus))
        print("New length: ", len(stemmed_corpus))
    return lemmatized_corpus

In [11]:
lemm_reviews = pd.Series(dtype="str")
i = 0
for review in tqdm(preproc_reviews):
    lemm_reviews = lemm_reviews.append(pd.Series([Lemmatizer(review)], index=[i]), ignore_index=True)
    i += 1

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:12<00:00, 818.62it/s]


Here is how it looks

In [12]:
lemm_reviews[0]

'One reviewer mentioned watching 1 Oz episode youll hooked right exactly happened meThe first thing struck Oz brutality unflinching scene violence set right word GO Trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordIt called OZ nickname given Oswald Maximum Security State Penitentary focus mainly Emerald City experimental section prison cell glass front face inwards privacy high agenda Em City home manyAryans Muslims gangsta Latinos Christians Italians Irish moreso scuffle death stare dodgy dealing shady agreement never far awayI would say main appeal show due fact go show wouldnt dare Forget pretty picture painted mainstream audience forget charm forget romanceOZ doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste Oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turned pris

### Sentiment analysis

In [13]:
import textblob

In [14]:
i = 0
for review, sentiment in zip(lemm_reviews, df.sentiment):
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', textblob.TextBlob(review).sentiment.polarity)
    i+=1
    print('-'*60)
    if i == 10:
        break

Actual Sentiment: positive
Predicted Sentiment polarity: 0.025586734693877542
------------------------------------------------------------
Actual Sentiment: positive
Predicted Sentiment polarity: 0.12760416666666669
------------------------------------------------------------
Actual Sentiment: positive
Predicted Sentiment polarity: 0.26473214285714286
------------------------------------------------------------
Actual Sentiment: negative
Predicted Sentiment polarity: -0.0421875
------------------------------------------------------------
Actual Sentiment: positive
Predicted Sentiment polarity: 0.24490093240093241
------------------------------------------------------------
Actual Sentiment: positive
Predicted Sentiment polarity: 0.045454545454545456
------------------------------------------------------------
Actual Sentiment: positive
Predicted Sentiment polarity: 0.2587121212121212
------------------------------------------------------------
Actual Sentiment: negative
Predicted Senti

By looking at the first 10 results, it seems promising

In [15]:
sentiment_polarity = [textblob.TextBlob(review).sentiment.polarity for review in lemm_reviews]
predicted_sentiments = ['positive' if score >= 0.1 else 'negative' for score in sentiment_polarity]
predicted_sentiments[0:10]

['negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive']

In [16]:
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [17]:
label_mapping = {'negative': 0, 'positive': 1}
predicted_remapped = [label_mapping[label] for label in predicted_sentiments]
review_remapped = [label_mapping[label] for label in df.sentiment]

print(confusion_matrix(review_remapped, predicted_remapped))
print("Accuracy Score: ", round(accuracy_score(review_remapped, predicted_remapped),3))
print("Precision Score: ", round(precision_score(review_remapped, predicted_remapped),3))
print("Recall Score: ", round(recall_score(review_remapped, predicted_remapped),3))
print("F1 Score: ", round(f1_score(review_remapped, predicted_remapped),3))

[[3795 1177]
 [1202 3826]]
Accuracy Score:  0.762
Precision Score:  0.765
Recall Score:  0.761
F1 Score:  0.763


Not bad!