In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')


In [2]:
df = pd.read_csv("./data/IMDB Dataset.csv")

In [3]:
import os
os.listdir("./data")[:10]


['IMDB Dataset.csv']

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
df.sample(5)

Unnamed: 0,review,sentiment
38374,"I went to see ""TKIA"" with high expectations, w...",negative
46635,I am sorry to say that it was one of the worst...,negative
44425,I decided to watch this show and give it a go ...,negative
30115,My first clue about how bad this was going to ...,negative
8325,After reading more than my fair share of revie...,positive


In [6]:
df_pos = df[df['sentiment']=='positive'][:5000]
df_neg = df[df['sentiment']=='negative'][:5000]

df_reviews = pd.concat([df_pos, df_neg ])

In [7]:
df_reviews.sentiment.value_counts()


sentiment
positive    5000
negative    5000
Name: count, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split


In [9]:
train,test = train_test_split(df_reviews,test_size =0.33,random_state=42)

In [10]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

In [11]:
train_y.value_counts()

sentiment
negative    3378
positive    3322
Name: count, dtype: int64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
test_x_vector = tfidf.transform(test_x)

In [14]:
train_x.shape

(6700,)

In [15]:
train_x_vector.shape

(6700, 44107)

In [16]:
type(train_x_vector)

scipy.sparse._csr.csr_matrix

In [17]:
primera_resenia = pd.DataFrame.sparse.from_spmatrix(train_x_vector,
                                  index=train_x.index,
                                  columns=tfidf.get_feature_names_out()).iloc[0]

In [18]:
primera_resenia

00           0
000          0
007          0
00am         0
00s          0
            ..
ísnt         0
île          0
önsjön       0
über         0
überwoman    0
Name: 6746, Length: 44107, dtype: Sparse[float64, 0]

In [19]:
train_x.iloc[0]

"I happened to rent this movie with my sister in hopes of watching a great entertaining movie, that was humorous, however my expectations were let down. This movie was beyond disgusting and revolting for a PG-13 movie, this should have been rated R for the many mature references that went on in this movie. I wouldn't recommend allowing a 13 year old teen see this.<br /><br />Even if no one under the age of 17 is watching this movie, beware of a truly stupid movie, there's no humor in the movie, just a bunch of disgusting sexual references including a small touch of pedophilia, something that shouldn't even be joked about. <br /><br />I would like to know what happened to PG-13 movies, that were actually safe for actual a 13 year old? This is beyond a deplorable movie and should be re-rated."

In [20]:
primera_resenia[primera_resenia != 0]

13               0.45849
17               0.12824
actual          0.091601
actually        0.061461
age             0.088765
allowing         0.12824
beware          0.143046
br              0.124945
bunch           0.093128
deplorable      0.168137
disgusting      0.237945
entertaining    0.081088
expectations    0.103149
great           0.048868
happened        0.176853
hopes           0.114028
humor           0.084465
humorous        0.116516
including       0.087603
joked           0.168137
just            0.037675
know            0.053984
let              0.07195
like            0.036454
mature          0.126021
movie           0.276309
movies            0.0522
old             0.123513
pedophilia      0.172712
pg              0.260154
rated           0.206643
recommend       0.076198
references      0.226901
rent            0.093234
revolting       0.151971
safe            0.119732
sexual          0.098677
shouldn         0.108203
sister          0.095008
small           0.078916


In [21]:
train_x.iloc[0]

"I happened to rent this movie with my sister in hopes of watching a great entertaining movie, that was humorous, however my expectations were let down. This movie was beyond disgusting and revolting for a PG-13 movie, this should have been rated R for the many mature references that went on in this movie. I wouldn't recommend allowing a 13 year old teen see this.<br /><br />Even if no one under the age of 17 is watching this movie, beware of a truly stupid movie, there's no humor in the movie, just a bunch of disgusting sexual references including a small touch of pedophilia, something that shouldn't even be joked about. <br /><br />I would like to know what happened to PG-13 movies, that were actually safe for actual a 13 year old? This is beyond a deplorable movie and should be re-rated."

In [22]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [23]:
print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['I did not like this movie at all I gave this movie away'])))

['positive']
['positive']
['negative']


In [24]:
print(svc.score(test_x_vector, test_y))

0.8706060606060606


In [25]:
from sklearn.metrics import f1_score

f1_score(test_y,svc.predict(test_x_vector),
          labels = ['positive','negative'],average=None)

array([0.87400413, 0.86701962])

In [26]:
from sklearn.metrics import classification_report

print(classification_report(test_y,
                            svc.predict(test_x_vector),
                            labels = ['positive','negative']))

              precision    recall  f1-score   support

    positive       0.87      0.88      0.87      1678
    negative       0.88      0.86      0.87      1622

    accuracy                           0.87      3300
   macro avg       0.87      0.87      0.87      3300
weighted avg       0.87      0.87      0.87      3300



In [27]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(test_y,
                           svc.predict(test_x_vector),
                           labels = ['positive', 'negative'])
conf_mat

array([[1481,  197],
       [ 230, 1392]])

In [28]:
import sys
print(sys.executable)


/Users/carolinavaladezgarrido/Documents/analisis_de_sentimientos/.venv/bin/python


In [29]:
import sys
print(sys.executable)


/Users/carolinavaladezgarrido/Documents/analisis_de_sentimientos/.venv/bin/python


In [33]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000)
logreg.fit(train_x_vector, train_y)

pred_lr = logreg.predict(test_x_vector)

print("Accuracy Logistic Regression:", logreg.score(test_x_vector, test_y))

from sklearn.metrics import classification_report
print(classification_report(test_y, pred_lr, labels=['positive','negative']))


Accuracy Logistic Regression: 0.8718181818181818
              precision    recall  f1-score   support

    positive       0.87      0.88      0.87      1678
    negative       0.88      0.86      0.87      1622

    accuracy                           0.87      3300
   macro avg       0.87      0.87      0.87      3300
weighted avg       0.87      0.87      0.87      3300



In [31]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(train_x_vector, train_y)

pred_mnb = mnb.predict(test_x_vector)

print("Accuracy MultinomialNB:", mnb.score(test_x_vector, test_y))
print(classification_report(test_y, pred_mnb, labels=['positive','negative']))


Accuracy MultinomialNB: 0.8542424242424242
              precision    recall  f1-score   support

    positive       0.91      0.80      0.85      1678
    negative       0.81      0.91      0.86      1622

    accuracy                           0.85      3300
   macro avg       0.86      0.86      0.85      3300
weighted avg       0.86      0.85      0.85      3300



In [32]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_x_vector, train_y)

pred_dt = dt.predict(test_x_vector)

print("Accuracy Decision Tree:", dt.score(test_x_vector, test_y))
print(classification_report(test_y, pred_dt, labels=['positive','negative']))


Accuracy Decision Tree: 0.7163636363636363
              precision    recall  f1-score   support

    positive       0.73      0.71      0.72      1678
    negative       0.71      0.72      0.71      1622

    accuracy                           0.72      3300
   macro avg       0.72      0.72      0.72      3300
weighted avg       0.72      0.72      0.72      3300

