In [63]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
# tokenize() needs three NLTK resources: the stopword list, the tokenizer
# models behind word_tokenize ('punkt') and the WordNet data behind
# WordNetLemmatizer. Only 'stopwords' was fetched before, so a fresh
# environment failed with a LookupError the first time a tweet was tokenized.
# NOTE(review): some NLTK releases additionally need 'punkt_tab' and/or
# 'omw-1.4' - confirm against the installed NLTK version.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/utkarshpadia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
# Load the labelled tweets used to train the model. The path is relative,
# so the CSV must sit in the notebook's working directory.
tweets_data = pd.read_csv("Twitter_Data.csv")


In [65]:
# Preview the first rows to see which columns were loaded.
tweets_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [66]:
# Check how the 'category' column (used later as the target) is stored.
tweets_data['category'].dtype

dtype('float64')

In [67]:
# Count rows whose tweet text is missing.
tweets_data['clean_text'].isnull().sum()

4

In [68]:
# Count rows whose category label is missing.
tweets_data['category'].isnull().sum()

7

In [69]:
# Dataset size before the incomplete rows are dropped.
tweets_data.shape

(162980, 2)

In [70]:
# Discard every row that has a missing tweet or a missing label, rebinding
# the name to the filtered frame.
tweets_data = tweets_data.dropna(axis=0)

In [71]:
# Confirm no missing labels remain after the drop.
tweets_data['category'].isnull().sum()

0

In [72]:
# Confirm no missing tweet texts remain after the drop.
tweets_data['clean_text'].isnull().sum()

0

In [73]:
# Dataset size after the incomplete rows were dropped.
tweets_data.shape

(162969, 2)

In [74]:
# Model input is the tweet text; the target is its category label.
X = tweets_data['clean_text']
Y = tweets_data['category']

In [75]:
# Hold out 30% of the tweets for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=4,
)


In [76]:
#processing the tweets
def tokenize(text):
    '''
    Turn a raw tweet into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with word_tokenize, English stopwords
    are dropped, and every remaining token is lemmatized with
    WordNetLemmatizer and stripped of surrounding whitespace.

    Input:
    text - a string(tweet) which is the input of the model

    Output:
    returns the input text as clean tokens in a list
    '''
    # The original comprehension evaluated stopwords.words('english') once per
    # token and searched the resulting list linearly. Build the collection a
    # single time, keep it as a set for constant-time membership tests, and
    # cache it on the function so every later tweet reuses it.
    english_stopwords = getattr(tokenize, '_english_stopwords', None)
    if english_stopwords is None:
        english_stopwords = frozenset(stopwords.words('english'))
        tokenize._english_stopwords = english_stopwords

    tokens = [word for word in word_tokenize(text.lower())
              if word not in english_stopwords]
    lemmatizer = WordNetLemmatizer()
    tokenized_text = [lemmatizer.lemmatize(token).strip() for token in tokens]
    return tokenized_text


In [77]:
#Defining the pipeline for transforming the data and classifying it
# Steps: bag-of-words counts built with the custom tokenizer -> TF-IDF
# weighting -> random forest classifier.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook reproduces the same model and the same metrics; without a seed
# every run trains a different forest.
nlp_pipeline = Pipeline([('vect',CountVectorizer(tokenizer = tokenize)),
                        ('tran',TfidfTransformer()),
                        ('clf',RandomForestClassifier(random_state = 4))])
#training the pipeline
nlp_pipeline.fit(x_train,y_train)
#making predictions on the test data
y_pred = nlp_pipeline.predict(x_test)

In [103]:
# Spot-check: compare the first prediction with the first true label.
y_pred[0] == y_test.iloc[0]

False

In [104]:
#Analyse the accuracy of the model
def check_accuracy(pred_values,real_values):
    '''
    Print a classification report comparing predictions with the true labels.

    Input:
    pred_values - predicted labels produced by the model
    real_values - actual labels for the same samples

    Output:
    Nothing is returned; the report text is written to standard output
    '''
    # classification_report takes the true labels first, then the predictions
    report = classification_report(real_values, pred_values)
    print(report)
    

In [105]:
# Report how the baseline nlp_pipeline model performs on the held-out test data
check_accuracy(pred_values=y_pred, real_values=y_test)

              precision    recall  f1-score   support

        -1.0       0.89      0.58      0.70     10678
         0.0       0.83      0.91      0.87     16657
         1.0       0.81      0.89      0.85     21556

    accuracy                           0.83     48891
   macro avg       0.84      0.79      0.81     48891
weighted avg       0.84      0.83      0.82     48891



In [110]:
# List the random forest's default hyperparameters to decide which ones to tune
RandomForestClassifier().get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [113]:
# Hyperparameter grid for the search; the "clf__" prefix addresses the
# pipeline step named 'clf'.
parameters = dict(
    clf__criterion=['gini', 'entropy'],
    clf__max_features=['sqrt', 'log2'],
)

In [114]:
#Defining the pipeline and tuned model
# A fresh, unfitted pipeline is built for the grid search. The forest is
# seeded (same seed as the train/test split) so that every parameter
# combination is compared on equal footing and the selected model is
# reproducible across notebook runs.
nlp_pipeline = Pipeline([('vect',CountVectorizer(tokenizer = tokenize)),
                        ('tran',TfidfTransformer()),
                        ('clf',RandomForestClassifier(random_state = 4))])
# Cross-validated search over `parameters`, run in a single process (n_jobs=1)
tuned_model = GridSearchCV(nlp_pipeline,parameters,n_jobs=1)
tuned_model.fit(x_train,y_train)
# Predict on the held-out test data with the fitted search object
y_pred_tuned = tuned_model.predict(x_test)


10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/utkarshpadia/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/utkarshpadia/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/utkarshpadia/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/Users/utkarshpadia/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py

In [115]:
# Report how the grid-searched model performs on the same held-out test data
check_accuracy(pred_values=y_pred_tuned, real_values=y_test)

              precision    recall  f1-score   support

        -1.0       0.89      0.58      0.70     10678
         0.0       0.83      0.92      0.87     16657
         1.0       0.82      0.89      0.85     21556

    accuracy                           0.83     48891
   macro avg       0.84      0.80      0.81     48891
weighted avg       0.84      0.83      0.83     48891



In [117]:
#Saving the model as a pickle file
# The context manager closes the file handle as soon as the dump finishes;
# the bare open() used before left the handle open until garbage collection.
# NOTE: the pipeline's vectorizer references the notebook-level `tokenize`
# function, and pickle stores functions by reference - code that loads
# model.pkl must have `tokenize` available under the same module name.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(tuned_model, model_file)