In [1]:
import pandas as pd
data = pd.read_csv("IMDB Dataset.csv")

In [2]:
import re
def clean_text(text):
    text = re.sub(r'<.*?>','',text)
    text = re.sub(r'^a-zA-Z\s','',text)
    text = text.lower()
    return text
data['review']= data['review'].apply(clean_text)

In [3]:
print(data.head())

                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production. the filming tec...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically there's a family where a little boy ...  negative
4  petter mattei's "love in the time of money" is...  positive


In [4]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# downloading stop-words from NLTK
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

def preprocess_text(text):
    words = text.split()
    words = [ word for word in words if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

data['review'] = data['review'].apply(preprocess_text)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yahya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data['sentiment'] = data['sentiment'].map({'positive':1,'negative':0})

In [7]:
print(data.head())

                                              review  sentiment
0  one review mention watch 1 oz episod hooked. r...          1
1  wonder littl production. film techniqu unassum...          1
2  thought wonder way spend time hot summer weeke...          1
3  basic there' famili littl boy (jake) think the...          0
4  petter mattei' "love time money" visual stun f...          1


In [8]:
print(data.isnull().sum())

review       0
sentiment    0
dtype: int64


In [9]:
print(data['sentiment'].value_counts())


sentiment
1    25000
0    25000
Name: count, dtype: int64


In [10]:
pip install -U scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
#Preparing data and test sets
from sklearn.model_selection import train_test_split

X = data['review']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print(f"training data set{len(X_train)}")

print(f"testing data set{len(y_test)}")

training data set40000
testing data set10000


In [12]:
print(f"train review data examples: \n{X_train.head(5)}")
print(f"train sentiment data examples: \n{y_train.head(5)}")
print(f"test review data examples: \n{X_test.head(5)}")
print(f"test sentiment data examples: \n{y_test.head(5)}")



train review data examples: 
39087    that' kept ask mani fights, scream matches, sw...
30893    watch entir movie. could watch entir movie. st...
45278    touch love stori reminisc in mood love'. draw...
16398    latter-day fulci schlocker total abysm concoct...
13653    first all, firmli believ norwegian movi contin...
Name: review, dtype: object
train sentiment data examples: 
39087    0
30893    0
45278    1
16398    0
13653    0
Name: sentiment, dtype: int64
test review data examples: 
33553    realli like summerslam due look arena, curtain...
9427     mani televis show appeal quit mani differ kind...
199      film quickli get major chase scene ever increa...
12447    jane austen would definit approv one!gwyneth p...
39489    expect somewhat high went see movie, thought s...
Name: review, dtype: object
test sentiment data examples: 
33553    1
9427     1
199      0
12447    1
39489    0
Name: sentiment, dtype: int64


In [None]:
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000,stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [36]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf,y_train)

In [37]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [38]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_tfidf,y_train)

In [39]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.83      0.85      4961
           1       0.84      0.87      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [41]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(X_train_tfidf,y_train)

In [42]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



SAVING TRAINED MODEL AND VECTOIZER

In [45]:
import pickle

model = model

with open('model.pkl','wb') as file:
    pickle.dump(model,file)

In [50]:
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

LOADİNG TRAINED MODEL 

In [46]:
import pickle
with open('model.pkl','rb') as file : 
    loaded_model = pickle.load(file)
    

In [48]:
predictions = loaded_model.predict(X_test_tfidf)

In [49]:
from sklearn.metrics import classification_report


print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

