In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled tweet corpus: one row per tweet, with the text in `tweet`
# and a binary label in `type` (1 is reported as "Political" and 0 as
# "Non-Political" by predict_sentence further down).
Data=pd.read_csv("final_combined_enriched_v4.csv")
Data

Unnamed: 0,tweet,type
0,This cartoon by Alok @caricatured speak about...,1
1,I am going to post one #BernieAtTheFarmersProt...,1
2,@sardesairajdeep @OfficialUrmila @RahulGandhi ...,1
3,Oooh... that`s right by the zoo... think... i...,0
4,@vivekoberoi @narendramodi @OmungKumar @suresh...,1
...,...,...
50164,ROYALICA Women Black Georgette Anarkali Kurta ...,0
50165,#Latamangeshkar is one of the star campaigner...,1
50166,Om shakthi-sadashiv Om sri-vishnu Jai ho modi...,1
50167,@RahulGandhi @RahulGandhi Shame on you and yo...,1


In [3]:
Data.head()

Unnamed: 0,tweet,type
0,This cartoon by Alok @caricatured speak about...,1
1,I am going to post one #BernieAtTheFarmersProt...,1
2,@sardesairajdeep @OfficialUrmila @RahulGandhi ...,1
3,Oooh... that`s right by the zoo... think... i...,0
4,@vivekoberoi @narendramodi @OmungKumar @suresh...,1


In [4]:
Data.tail()

Unnamed: 0,tweet,type
50164,ROYALICA Women Black Georgette Anarkali Kurta ...,0
50165,#Latamangeshkar is one of the star campaigner...,1
50166,Om shakthi-sadashiv Om sri-vishnu Jai ho modi...,1
50167,@RahulGandhi @RahulGandhi Shame on you and yo...,1
50168,it can happen only in #india,0


In [5]:
Data.shape

(50169, 2)

In [6]:
Data["type"].value_counts()

type
1    27459
0    22710
Name: count, dtype: int64

## Check missing values

In [7]:
Data.isna().sum()

tweet    0
type     0
dtype: int64

## Very important (Text cleaning)

In [8]:
## Cleaning steps applied to every tweet
#1. lowercase
#2. remove URLs
#3. remove mentions & hashtags
#4. remove punctuation (and every other non-letter character)
#5. remove stopwords

In [9]:
import re
import nltk
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stored as a set because clean_text does one membership check per word.
stop_words=set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    The steps run in this order (the order matters, see the comments below):
    lowercase, strip URLs, strip @mentions, strip #hashtags, drop every
    character that is not ``a-z`` or whitespace, then remove English
    stopwords using the notebook-level ``stop_words`` set.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. It is an empty string when nothing survives the
        cleaning (e.g. a tweet made only of mentions and links).
    """
    text = text.lower()

    # URLs must go before the punctuation pass: otherwise "http://t.co/abc"
    # collapses into the junk token "httptcoabc". The pattern covers plain
    # http:// links and bare www. links in addition to https:// ones.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)      # remove mentions
    text = re.sub(r'#\w+', '', text)      # remove hashtags (the whole tag, not just '#')
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation, digits and other symbols

    # split() with no argument also collapses the runs of whitespace that the
    # substitutions above leave behind.
    return " ".join(word for word in text.split() if word not in stop_words)

Data['clean_tweet']=Data['tweet'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## step 4   Train test split

In [10]:
from sklearn.model_selection import train_test_split

# Features are the cleaned tweets; the target is the binary `type` label.
X = Data["clean_tweet"]
y = Data["type"]

# 80/20 hold-out with a fixed seed, stratified on the label so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

## step 5 Convert text to numbers(TF-IDF)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigrams and bigrams, with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vectorizer is fitted on the training split only; the test split is
# merely transformed with that fitted vocabulary.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


## step 6 Train Multiple Models

In [12]:
## 1. Logistic Regression

from sklearn.linear_model import LogisticRegression

# Baseline with default hyperparameters; C is tuned later with GridSearchCV.
lr = LogisticRegression()
lr.fit(X_train_vec, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
## 2. Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the same TF-IDF
# matrix as the other two models so the results are comparable.
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [14]:
## 3. Support Vector Machine (SVM)

from sklearn.svm import LinearSVC
# Linear SVM with default settings, trained on the same TF-IDF matrix.
svm = LinearSVC()
svm.fit(X_train_vec, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


## Step 7 Evaluate Models

In [15]:
from sklearn.metrics import accuracy_score,classification_report

def evaluate(model):
    """Print accuracy and the per-class report for a fitted model.

    The model is scored on the notebook-level hold-out split
    (``X_test_vec`` / ``y_test``). Nothing is returned; the results are
    only printed.
    """
    predictions = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, predictions))
# Score the three fitted classifiers in turn, each under its own heading.
for heading, fitted in (
    ("Logistic Regression", lr),
    ("Navie Bayes", nb),
    ("Support Vector Machine", svm),
):
    print(heading)
    evaluate(fitted)

Logistic Regression
Accuracy: 0.9104046242774566
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4542
           1       0.94      0.89      0.92      5492

    accuracy                           0.91     10034
   macro avg       0.91      0.91      0.91     10034
weighted avg       0.91      0.91      0.91     10034

Navie Bayes
Accuracy: 0.8690452461630457
              precision    recall  f1-score   support

           0       0.92      0.78      0.84      4542
           1       0.84      0.94      0.89      5492

    accuracy                           0.87     10034
   macro avg       0.88      0.86      0.87     10034
weighted avg       0.87      0.87      0.87     10034

Support Vector Machine
Accuracy: 0.9065178393462229
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      4542
           1       0.93      0.89      0.91      5492

    accuracy                           0

##  Select the model with the highest accuracy and F1 score

##  Step 8 Hyperparameter Tuning

In [16]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C, the only hyperparameter being searched.
params = {"C": [0.1, 1, 10]}

# 5-fold cross-validation on the training split, ranking candidates by F1.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring="f1",
    cv=5,
)
grid.fit(X_train_vec, y_train)

grid.best_params_

{'C': 10}

In [17]:
## Take the best model from the grid search
# No training happens here: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the training data using the
# winning C.

best_model = grid.best_estimator_

## Step 9 Save Model

In [18]:
import joblib
# Persist both artefacts: the classifier can only score text that has been
# transformed by this same fitted vectorizer.
joblib.dump(best_model,"political_model.pkl")
joblib.dump(tfidf,"tfidf.pkl")

['tfidf.pkl']

## Step 10  Predict New sentence

In [19]:
# Reload the artefacts from disk so the prediction path is checked against the
# saved files rather than the in-memory objects.
model = joblib.load("political_model.pkl")
vectorizer = joblib.load("tfidf.pkl")

def predict_sentence(text, model=model, vectorizer=vectorizer):
    """Classify one raw sentence as "Political" or "Non-Political".

    Parameters
    ----------
    text : str
        Raw, uncleaned sentence; it goes through the same ``clean_text``
        pipeline that was used on the training data.
    model, vectorizer : optional
        Fitted classifier and fitted TF-IDF vectorizer. They default to the
        objects loaded just above and are bound when this function is
        defined. That binding is deliberate: the notebook-level name
        ``model`` is rebound to the BERT model in the deep-learning section,
        and reading the global at call time would then break this function.

    Returns
    -------
    str
        "Political" when the classifier predicts label 1, otherwise
        "Non-Political".
    """
    cleaned = clean_text(text)
    features = vectorizer.transform([cleaned])  # transform expects an iterable of documents
    label = model.predict(features)[0]

    return "Political" if label == 1 else "Non-Political"

# Smoke test on a sentence that is not taken from the dataset.
predict_sentence("The decision sparked debates across universities and workplaces")

'Political'

# Deep Learning 

### Train test split

In [20]:
# Split for the BERT section. It uses the raw `tweet` column rather than
# `clean_tweet`, and is stratified on the label with the same test_size and
# seed as the TF-IDF split, so both model families are scored on a hold-out
# set with the same class balance (and, given the identical seed and labels,
# the same rows should be selected).
train_texts, test_texts, train_labels, test_labels=train_test_split(
    Data['tweet'].tolist(),
    Data['type'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=Data['type'].tolist())

## Load BERT Tokenizer

In [21]:
import torch

from transformers import BertForSequenceClassification
from transformers import Trainer,TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





In [22]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## Tokenization

In [23]:
def _encode(texts):
    """Tokenize a list of tweets with the settings shared by both splits."""
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding=True,
    )

# Both splits go through the identical tokenizer call.
train_encodings = _encode(train_texts)
test_encodings = _encode(test_texts)

## Create Torch Dataset

In [24]:
class TweetDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()`` whose values can be indexed by example
    position). ``labels`` is a sequence aligned with those examples.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The labels define how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for example `idx`, plus its label
        # under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = TweetDataset(train_encodings, train_labels)
test_dataset = TweetDataset(test_encodings, test_labels)

# Load Pretrained BERT Model

In [25]:
# Pretrained BERT encoder with a 2-way classification head, one output per
# value of `type`. The head is not part of the checkpoint and starts from
# newly initialised weights (see the message printed below), so the model
# must be fine-tuned before its predictions mean anything.
# NOTE: this rebinds the notebook-level name `model`, which until this cell
# held the scikit-learn classifier loaded from political_model.pkl.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training Configuration

In [26]:
training_args = TrainingArguments(
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Trainer

In [27]:
def compute_metrics(eval_pred):
    """Convert the Trainer's raw evaluation output into classification metrics.

    Without this hook ``trainer.evaluate()`` reports only the loss, which
    cannot be compared with the accuracy / F1 printed for the TF-IDF models.

    Parameters
    ----------
    eval_pred : transformers.EvalPrediction
        Carries ``predictions`` (the model logits) and ``label_ids``.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall`` and ``f1``; the last three
        are computed for the positive class (label 1).
    """
    # Imported here so this cell does not depend on an earlier cell's imports.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs; the logits come first.
        logits = logits[0]
    labels = eval_pred.label_ids

    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0)

    return {
        "accuracy": float(accuracy_score(labels, predicted)),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics)

# Train model

In [None]:
trainer.train()   ##take time 15-40minutes (GPU faster)

  super().__init__(loader)


Step,Training Loss


# Evaluate Model

In [None]:
trainer.evaluate()