In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib as plt

In [39]:
df = pd.read_csv('sentiment.csv', encoding= 'latin-1')
df.columns
df.head(10)
# df.info() 
# df.size

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
5,726e501993,that`s great!! weee!! visitors!,positive,night,70-100,Antigua and Barbuda,97929.0,440.0,223.0
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,morning,0-20,Argentina,45195774.0,2736690.0,17.0
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative,noon,21-30,Armenia,2963243.0,28470.0,104.0
8,e64208b4ef,and within a short time of the last clue all ...,neutral,night,31-45,Australia,25499884.0,7682300.0,3.0
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral,morning,46-60,Austria,9006398.0,82400.0,109.0


In [25]:
# feature engineering
df['text'] = df['text'].fillna("").astype(str)

In [26]:
# Features and labels
X = df['text']
y = df['sentiment']

In [36]:
# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

# Convert textual/categorical data into numerical data usinf tfidvectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_vect = vectorizer.fit_transform(X)

In [28]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_vect, y_encoded, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

In [23]:

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.34      0.47       226
           1       0.51      0.57      0.54       268
           2       0.72      0.54      0.62       211
           3       0.64      1.00      0.78       258

    accuracy                           0.62       963
   macro avg       0.66      0.61      0.60       963
weighted avg       0.65      0.62      0.60       963



In [None]:
new_texts = [
    "I absolutely love this product! It's amazing.",
    "bad, but i liked it.",
    "it is good"
]
# FilterING out empty strings
new_texts = [text if text.strip() != "" else "No text provided" for text in new_texts]
new_texts_vect = vectorizer.transform(new_texts) 
new_sentiments = model.predict(new_texts_vect)
# Decode the predicted sentiments back to their original labels
decoded_sentiments = label_encoder.inverse_transform(new_sentiments)
# Display
for text, sentiment in zip(new_texts, decoded_sentiments):
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

Text: I absolutely love this product! It's amazing.
Predicted Sentiment: positive

Text: bad, but i liked it.
Predicted Sentiment: negative

Text: it is good
Predicted Sentiment: positive

