In [3]:
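# Optional: uncomment to pin scikit-learn to the version below.
# (%pip installs into the running kernel's environment, unlike !pip.)
# %pip install scikit-learn==1.5.2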

In [4]:
# Import libraries
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Read the already preprocessed Sentiment140 dataset into `df`.

# Location of the cleaned/stemmed/lemmatized CSV
data_path = '/kaggle/input/cleaned-stemmed-lemmatized-sentiment140-dataset/cleaned_stemmed_lemmatized_sentiment140.csv'

# Column schema: dtype per column; the column order follows the dict's insertion order
column_dtypes = {'polarity': 'int64', 'text': 'str'}
column_names = list(column_dtypes)

# skiprows=1 skips the file's first line (its header row, if present);
# header=None together with names= then applies our own column labels.
df = pd.read_csv(
    data_path,
    names=column_names,
    dtype=column_dtypes,
    header=None,
    skiprows=1,
    encoding='latin-1',
)

df.head()

Unnamed: 0,polarity,text
0,0,user url aww that bummer shoulda got david car...
1,0,upset cant updat facebook text might cri resul...
2,0,user dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,user behav im mad cant see


In [6]:
# Per-column non-null counts among the rows whose text is missing
missing_text = df['text'].isna()
df.loc[missing_text].count()

polarity    419
text          0
dtype: int64

In [7]:
# Rows that have a missing value in at least one column
null_mask = df.isna().any(axis='columns')
null_rows = df.loc[null_mask]
null_rows

Unnamed: 0,polarity,text
3997,0,
4233,0,
8131,0,
18950,0,
24802,0,
...,...,...
1592715,1,
1595775,1,
1596318,1,
1598921,1,


In [8]:
# Discard rows with no text; rebinding `df` keeps this cell safe to re-run
df = df.dropna(subset=['text'])

In [9]:
# Summarize the frame (entry count, per-column non-null counts, dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1599581 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1599581 non-null  int64 
 1   text      1599581 non-null  object
dtypes: int64(1), object(1)
memory usage: 36.6+ MB


In [10]:
# Settings for the split and the vectorizer
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_FEATURES = 5000

# Features are the text column; targets are the polarity labels
X, y = df['text'], df['polarity']

# Hold out a test split (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# TF-IDF features capped at MAX_FEATURES terms; the vocabulary is fitted on the
# training split only and then reused to transform the test split.
# (CountVectorizer(max_features=MAX_FEATURES) can be swapped in as an alternative.)
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Fit the logistic regression classifier (fit() returns the estimator itself)
model = LogisticRegression(C=2, max_iter=1000, n_jobs=-1).fit(X_train_vect, y_train)

# Score the held-out split
y_pred = model.predict(X_test_vect)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.78      0.76      0.77    159259
           1       0.77      0.79      0.78    160658

    accuracy                           0.78    319917
   macro avg       0.78      0.78      0.77    319917
weighted avg       0.78      0.78      0.77    319917



In [11]:
# Persist the fitted classifier and its vectorizer as separate joblib files
MODEL_FILE = 'logistic_regression_model.pkl'
VECTORIZER_FILE = 'logistic_regression_vectorizer.pkl'

joblib.dump(model, MODEL_FILE)
joblib.dump(vectorizer, VECTORIZER_FILE)

['logistic_regression_vectorizer.pkl']