In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline




In [93]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ym221\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
imdb = pd.read_csv('IMDB Dataset.csv')

In [95]:
imdb.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [96]:
imdb.tail

<bound method NDFrame.tail of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [97]:
# now we are going to clean and process the data
# Check for missing values
print(imdb.isnull().sum())

# Remove rows with missing values
imdb = imdb.dropna()

review       0
sentiment    0
dtype: int64


In [98]:
#text cleaning
import re

# Function to clean text
def clean_text(text):
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub('[^A-Za-z]+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function to the 'review' column
imdb['review'] = imdb['review'].apply(clean_text)


In [99]:
imdb.head

<bound method NDFrame.head of                                                   review sentiment
0      one of the other reviewers has mentioned that ...  positive
1      a wonderful little production the filming tech...  positive
2      i thought this was a wonderful way to spend ti...  positive
3      basically there s a family where a little boy ...  negative
4      petter mattei s love in the time of money is a...  positive
...                                                  ...       ...
49995  i thought this movie did a down right good job...  positive
49996  bad plot bad dialogue bad acting idiotic direc...  negative
49997  i am a catholic taught in parochial elementary...  negative
49998  i m going to have to disagree with the previou...  negative
49999  no one expects the star trek movies to be high...  negative

[50000 rows x 2 columns]>

In [100]:
#to check if it was clean
# Print a few samples of raw text before cleaning
print("Raw Text - Before Cleaning:")
for i in range(5):
    print(imdb['review'].iloc[i])
    print('-' * 50)

# Apply the cleaning function
imdb['cleaned_review'] = imdb['review'].apply(clean_text)

# Print the same samples after cleaning
print("\nCleaned Text:")
for i in range(5):
    print(imdb['cleaned_review'].iloc[i])
    print('-' * 50)


Raw Text - Before Cleaning:
one of the other reviewers has mentioned that after watching just oz episode you ll be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wo

In [101]:
#checking
# Print specific entries for detailed comparison
index_to_check = 10  # Change this index as needed
print("Raw Text:")
print(imdb['review'].iloc[index_to_check])
print('-' * 50)

print("\nCleaned Text:")
print(imdb['cleaned_review'].iloc[index_to_check])
print('-' * 50)


Raw Text:
phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines at first it was very odd and pretty funny but as the movie progressed i didn t find the jokes or oddness funny anymore its a low budget film thats never a problem in itself there were some pretty interesting characters but eventually i just lost interest i imagine this film would appeal to a stoner who is currently partaking for something similar but better try brother from another planet 
--------------------------------------------------

Cleaned Text:
phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines at first it was very odd and pretty funny but as the movie progressed i didn t find the jokes or oddness funny anymore its a low budget film thats never a problem in itself there were some pretty interesting characters but eventually i just lost interest i imagine 

In [102]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ym221\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [103]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)


In [104]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ym221\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [105]:
# Apply the function to the 'review' column
imdb['review'] = imdb['review'].apply(remove_stopwords)


In [106]:
#for checking
# Check a few samples of 'review' column before and after applying the function
print("Before Tokenization and Stopwords Removal:")
print(imdb['review'].head())

# Apply the function to the 'review' column
imdb['review'] = imdb['review'].apply(remove_stopwords)

# Check the same samples after applying the function
print("\nAfter Tokenization and Stopwords Removal:")
print(imdb['review'].head())


Before Tokenization and Stopwords Removal:
0    one reviewers mentioned watching oz episode ho...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically family little boy jake thinks zombie...
4    petter mattei love time money visually stunnin...
Name: review, dtype: object

After Tokenization and Stopwords Removal:
0    one reviewers mentioned watching oz episode ho...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically family little boy jake thinks zombie...
4    petter mattei love time money visually stunnin...
Name: review, dtype: object


In [107]:
print(imdb['sentiment'].isnull().sum())

0


In [108]:
imdb = imdb.dropna(subset=['sentiment'])

In [109]:
imdb['sentiment'] = imdb['sentiment'].fillna('unknown')


In [110]:
print(imdb['sentiment'].unique())


['positive' 'negative']


In [111]:
# Replace unexpected values with a default value (e.g., 'unknown')
imdb['sentiment'] = imdb['sentiment'].map({'positive': 1, 'negative': 0,})


In [112]:
imdb.head

<bound method NDFrame.head of                                                   review  sentiment  \
0      one reviewers mentioned watching oz episode ho...          1   
1      wonderful little production filming technique ...          1   
2      thought wonderful way spend time hot summer we...          1   
3      basically family little boy jake thinks zombie...          0   
4      petter mattei love time money visually stunnin...          1   
...                                                  ...        ...   
49995  thought movie right good job creative original...          1   
49996  bad plot bad dialogue bad acting idiotic direc...          0   
49997  catholic taught parochial elementary schools n...          0   
49998  going disagree previous comment side maltin on...          0   
49999  one expects star trek movies high art fans exp...          0   

                                          cleaned_review  
0      one of the other reviewers has mentioned that ...  

In [113]:
#split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(imdb['review'], imdb['sentiment'], test_size=0.2, random_state=42)


In [114]:
print(X_train,y_train)


39087    kept asking many fights screaming matches swea...
30893    watch entire movie could watch entire movie st...
45278    touching love story reminiscent mood love draw...
16398    latter day fulci schlocker totally abysmal con...
13653    first firmly believe norwegian movies continua...
                               ...                        
11284    shadow magic recaptures joy amazement first mo...
44732    found movie quite enjoyable fairly entertainin...
38158    avoid one terrible movie exciting pointless mu...
860      production quite surprise absolutely love obsc...
15795    decent movie although little bit short time pa...
Name: review, Length: 40000, dtype: object 39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=50000)  # You can adjust max_features based on your dataset

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Now, X_train_tfidf and X_test_tfidf contain the TF-IDF representations of the text data


In [116]:

# Train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8971
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [117]:
# Assuming you have a new dataset 'new_data' with a 'review' column
new_data = pd.read_csv('IMDB Dataset.csv')

# Preprocess the new data using the same TF-IDF vectorizer
new_data_tfidf = tfidf_vectorizer.transform(new_data['review'])

# Make predictions using the trained logistic regression model
new_data_predictions = model.predict(new_data_tfidf)

# Display the predictions
new_data['predicted_sentiment'] = new_data_predictions
print(new_data[['review', 'predicted_sentiment']])



                                                  review  predicted_sentiment
0      One of the other reviewers has mentioned that ...                    1
1      A wonderful little production. <br /><br />The...                    1
2      I thought this was a wonderful way to spend ti...                    1
3      Basically there's a family where a little boy ...                    0
4      Petter Mattei's "Love in the Time of Money" is...                    1
...                                                  ...                  ...
49995  I thought this movie did a down right good job...                    1
49996  Bad plot, bad dialogue, bad acting, idiotic di...                    0
49997  I am a Catholic taught in parochial elementary...                    1
49998  I'm going to have to disagree with the previou...                    0
49999  No one expects the Star Trek movies to be high...                    0

[50000 rows x 2 columns]


In [119]:
# now we can predict if the the review is positve or negative
# Function to preprocess and predict sentiment for a given text
def predict_sentiment(text):
    # Preprocess the text
    text_tfidf = tfidf_vectorizer.transform([text])
    
    # Make prediction
    prediction = model.predict(text_tfidf)
    
    # Print the result
    sentiment = "positive" if prediction[0] == 1 else "negative"
    print(f"Predicted sentiment for the text: '{text}' is {sentiment}")

# Example text samples
text_samples = [
    "I hsted the movie it was very boring",
    "The movie was terrible, a waste of time and money.",
    "It was an okay movie, not the best but not the worst either."
]

# Predict sentiment for each text sample
for text_sample in text_samples:
    predict_sentiment(text_sample)


Predicted sentiment for the text: 'I hsted the movie it was very boring' is negative
Predicted sentiment for the text: 'The movie was terrible, a waste of time and money.' is negative
Predicted sentiment for the text: 'It was an okay movie, not the best but not the worst either.' is negative


imdb.head