In [1]:
import pandas as pd  
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from preprocessor import DataPreprocessor

In [2]:
# Reading a CSV File 
df = pd.read_csv('./datasets/twitter_data.csv', encoding = 'latin', header = None)

In [3]:
# Naming the Columns
df.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']

# Dropping Irrelevant Columns
df = df.drop(['id', 'date', 'query', 'user_id'], axis = 1)

# Replacing the column values with new values
df['sentiment'] = df['sentiment'].replace({0:'Negative', 4:'Positive'})

In [4]:
# Shuffling the dataset randomly 

df = df.sample(frac=1, random_state=42)

In [5]:
# Splitting the data

X_train, X_test, Y_train, Y_test  = train_test_split(df['text'].values, df['sentiment'].values, test_size=0.2, random_state=42)

In [9]:
# Creating A NLP Pipeline

pipeline = Pipeline(steps=[('preprocessor', DataPreprocessor()), ('vectorizer', TfidfVectorizer()), ('model', LogisticRegression())])

In [17]:
import warnings
warnings.filterwarnings("ignore")

# Your code that produces warnings here

pipeline.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Predicting Results on the Test Data

Y_pred = pipeline.predict(X_test)

In [19]:
# Calucalate Accuracy

accuracy = accuracy_score(Y_pred, Y_test)

print(f'Accuracy: {accuracy}')
print('Classification Report:', classification_report(Y_pred, Y_test))


Accuracy: 0.77554375
Classification Report:               precision    recall  f1-score   support

    Negative       0.76      0.78      0.77    154082
    Positive       0.79      0.77      0.78    165918

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



In [22]:
import joblib
joblib.dump(pipeline, './twt_pipeline')