In [1]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
data = pd.read_csv("/content/IMDB_Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
# shape of the data
data.shape

(50000, 2)

In [26]:
# count of unique values in the column
data['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [4]:
# Function to clean text data
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove text within square brackets (e.g., [example])
    text = re.sub('\[.*?\]', '', text)

    # Remove punctuation characters (e.g., ., !, ?, etc.)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing numbers (e.g., "text123" or "12text")
    text = re.sub('\w*\d\w*', '', text)

    # Remove specific characters such as single quotes, double quotes, and commas
    text = re.sub('[\'"",]', '', text)

    # Remove newline characters
    text = re.sub('\n', '', text)

    return text


In [5]:
data['review'] = data['review'].apply(clean_text)


In [6]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [7]:
# Splitting dataset
X = data['review'].values
y = data['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=225)


In [17]:
tf = TfidfVectorizer()
from sklearn.pipeline import Pipeline

# TF-IDF (Term Frequency-Inverse Document Frequency) is chosen over Bag of Words (BoW) because:
# 1. **Handles Word Importance**: TF-IDF assigns higher weights to important words that appear frequently in a document but not across all documents, reducing the impact of common words.
# 2. **Reduces the Influence of Stopwords**: Unlike BoW, which treats all words equally, TF-IDF reduces the importance of commonly occurring words like "the", "is", and "and".
# 3. **Sparsity Reduction**: TF-IDF gives non-zero weights to fewer words per document compared to BoW, making it more efficient for large datasets.
# 4. **Better for Text Similarity**: TF-IDF helps in distinguishing between documents by emphasizing unique terms, whereas BoW only counts occurrences.
# 5. **Improved Performance**: Models trained using TF-IDF often perform better as it captures the relevance of words instead of just their frequency.

In [18]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
model=Pipeline([('vectorizer',tf),('classifier',classifier)])

model.fit(X_train,y_train)

In [19]:
ypred=model.predict(X_test)

In [20]:
# model score
accuracy_score(ypred,y_test)

0.89344

In [21]:
# confusion matrix
A=confusion_matrix(y_test,ypred)
print(A)

[[5533  727]
 [ 605 5635]]


In [23]:
#evaluation
recall=A[0][0]/(A[0][0]+A[1][0])
print(recall)
precision=A[0][0]/(A[0][0]+A[0][1])
print(precision)
F1=2*recall*precision/(recall+precision)
print(F1)

0.9014336917562724
0.8838658146964856
0.892563316663978


In [24]:
print("Classification Report:")
print(classification_report(y_test, ypred))


Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      6260
    positive       0.89      0.90      0.89      6240

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



Try Different Algorithms:

1. Experiment with advanced models like Random Forest, XGBoost, or deep learning models (LSTMs) that can better capture sentiment nuances.
2. Use ensemble learning by combining multiple models (e.g., Logistic Regression + Naive Bayes) to improve performance.