In [None]:

import requests

url = "https://media.geeksforgeeks.org/wp-content/uploads/20240514105101/IMDB-Dataset.csv"
response = requests.get(url)

if response.status_code == 200:
    # Save the content to a file
    with open('IMDB-Dataset.csv', 'wb') as f:
        f.write(response.content)
    print("File downloaded successfully.")
else:
    print(f"Failed to download file. Status code: {response.status_code}")



File downloaded successfully.


In [None]:
import pandas as pd
df=pd.read_csv('IMDB-Dataset.csv')


In [None]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [None]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
df.shape

(50000, 2)

In [None]:
#handling missing values
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
df_positive=df[df['sentiment']=='positive'][:2000]
df_negative=df[df['sentiment']=='negative'][:2000]


In [None]:
df2=pd.concat([df_positive,df_negative],axis=0)

In [None]:
df2.shape

(4000, 2)

In [None]:
df2.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


#Text preprocessing

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")


In [None]:
def lemmatization(text):
  doc=nlp(text)
  lemmaList=[word.lemma_ for word in doc]
  return ' '.join(lemmaList)

In [None]:
df2['lemma']=df2['review'].apply(lemmatization)

KeyboardInterrupt: 

In [None]:
df2.head()

In [None]:
def remove_stopwords(text):
  doc=nlp(text)
  no_stopwords=[word.text for word in doc if not word.is_stop and not word.is_punct]
  return ' '.join(no_stopwords)


In [None]:
df2['final']=df2['lemma'].apply(remove_stopwords)

In [None]:
df2.drop(columns=['lemma','review'],inplace=True)

In [None]:
df2.head()

In [None]:
new_csv=df2.to_csv('data_after_lemmatizaion_and_preprocessing',index=False)

In [6]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [129]:
df_new=pd.read_csv('data_after_lemmatizaion_and_preprocessing.csv')

#TFIDVECTORIZER
  

In [130]:
tfidf=TfidfVectorizer()

In [131]:
X=df_new['final']
y=df_new['sentiment']

In [132]:
tfidf.fit(X)

In [133]:
tfidf_matrix=tfidf.transform(X).toarray()

In [134]:
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(4000, 29808))

In [135]:
vectorizer_df=pd.DataFrame(tfidf_matrix,columns=tfidf.get_feature_names_out())

In [136]:
vectorizer_df

Unnamed: 0,00,000,007,00am,01,01pm,02,04,05,06,...,élan,émigré,émigrés,être,ís,ísnt,île,ïn,önsjön,überwoman
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
X.shape,vectorizer_df.shape

((4000,), (4000, 29808))

In [138]:
X=pd.concat([X,vectorizer_df],axis=1)

In [139]:
X.head()

Unnamed: 0,final,00,000,007,00am,01,01pm,02,04,05,...,élan,émigré,émigrés,être,ís,ísnt,île,ïn,önsjön,überwoman
0,reviewer mention watch 1 Oz episode hook right...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,wonderful little production < br /><br />the f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,think wonderful way spend time hot summer week...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Petter Mattei love Time money visually stunnin...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,probably time favorite movie story selflessnes...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##train-test split

In [140]:
X.drop(columns=['final'], inplace=True)

X['new_column'] = 0

In [141]:
X.head()

Unnamed: 0,00,000,007,00am,01,01pm,02,04,05,06,...,émigré,émigrés,être,ís,ísnt,île,ïn,önsjön,überwoman,new_column
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [142]:
X.shape,vectorizer_df.shape

((4000, 29808), (4000, 29808))

In [143]:
tfidf.get_feature_names_out().shape

(29808,)

In [144]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [145]:
X_train.shape,X_test.shape

((3200, 29808), (800, 29808))

#Building model

In [146]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       378
           1       0.87      0.89      0.88       422

    accuracy                           0.87       800
   macro avg       0.87      0.87      0.87       800
weighted avg       0.87      0.87      0.87       800



In [147]:
import joblib

# After training
joblib.dump(model, 'model.pkl')
joblib.dump(tfidf, 'tfidf.pkl')


['tfidf.pkl']

In [148]:
pip install streamlit




In [149]:
import streamlit as st
import spacy
import joblib

# Load model and vectorizer
model = joblib.load('C:/Users/Nihal/Desktop/Sentimental_Analysis_Tool/model.pkl')
tfidf = joblib.load('C:/Users/Nihal/Desktop/Sentimental_Analysis_Tool/tfidf.pkl')
nlp = spacy.load("en_core_web_sm")

# Custom tokenizer
def spacy_tokenizer(text):
    doc = nlp(text)
    return [
        token.lemma_.lower() for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]

# UI
st.title("🧠 Sentiment Analysis Tool")
st.write("Type your review and get the predicted sentiment!")

# Text input
user_input = st.text_area("Enter your text here")

if st.button("Analyze Sentiment"):
    if user_input.strip() == "":
        st.warning("Please enter some text.")
    else:
        # Transform input
        input_vec = tfidf.transform([user_input])
        prediction = model.predict(input_vec)[0]
        sentiment = "Positive 😊" if prediction == 1 else "Negative 😞"
        st.success(f"Predicted Sentiment: **{sentiment}**")




In [150]:
!streamlit run C:\Users\Nihal\Desktop\Sentimental_Analysis_Tool\Sentimental_analysis_tool.ipynb

Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Streamlit requires raw Python (.py) files, not .ipynb.
For more information, please see https://docs.streamlit.io


In [151]:
print(tfidf.get_feature_names_out().shape)
print(model.coef_.shape)


(29808,)
(1, 29808)
