In [51]:
import pandas as pd
import numpy as np
import ast

In [52]:
# Load the news dataset from the CSV file into a pandas DataFrame.
data = pd.read_csv("news_dataset.csv")

In [53]:
# Show the complete text of the article stored under row label 0.
data.loc[0, 'text']

'Payal has accused filmmaker Anurag Kashyap of behaving inappropriately with her in a video that went viral. She maintained her stance while speaking to ETimes and said, “I have wanted to speak about this for a long time. But today, finally I thought I must get it off my head. I had tweeted about my incident sometime ago when the #MeToo movement had happened, but many people told me to delete the tweet else I would stop getting work. My manager too advised me to remove the tweet. I complied. Post that, Anurag blocked me on WhatsApp.”'

In [54]:
# Display the DataFrame (a notebook cell shows the value of its last expression).
data

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...
...,...,...
3724,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3725,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3726,FAKE,The Bengaluru City Police’s official Twitter h...
3727,REAL,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\..."


In [55]:
# Count the missing values in each column.

data.isna().sum()

label    0
text     8
dtype: int64

In [56]:
# Drop every row that contains a missing value.
# The result is rebound to `data` instead of using inplace=True, so the step
# reads as "old frame -> new frame" and no existing object is mutated.

data = data.dropna()

In [57]:
# Split the data into two sets: 80% for training and 20% held out for testing.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Display the training split.
train_data

Unnamed: 0,label,text
209,FAKE,Several mainstream news outlets such as the Ti...
3374,REAL,NEW DELHI: The Drugs Controller General of Ind...
3540,FAKE,A screenshot purporting to be a newspaper clip...
2472,FAKE,A viral video of a woman with infant traveling...
2510,REAL,Read Also\n\nBe it winning hearts or winning t...
...,...,...
1133,REAL,"Kareena Kapoor Khan, who is all set to ring in..."
1297,REAL,ROME: Novak Djokovic knows it isn't model beha...
863,FAKE,Facebook screenshot\n\nFollowing Rashtriya Lok...
3515,REAL,NEW DELHI: The finance ministry on Saturday in...


In [59]:
# Display the held-out test split.
test_data

Unnamed: 0,label,text
908,REAL,NEW DELHI: A final decision on Pakistan's stat...
3454,REAL,NEW DELHI: Seven of the top 10 most valued dom...
1790,REAL,"Kareena Kapoor Khan, who is all set to ring in..."
1167,FAKE,A photo purporting to show a television news g...
1605,FAKE,A disturbing video of a woman being flogged by...
...,...,...
1239,FAKE,A story claiming Barack and Michelle Obama hav...
2409,FAKE,A fake list claiming WikiLeaks has named 24 In...
1958,FAKE,CLAIM: Muslims rioted in UK's Birmingham durin...
2680,FAKE,"Maleeha Lodhi, Pakistan’s permanent representa..."


In [60]:
# Convert the string values in the `label` column into integer class ids.

from sklearn.preprocessing import LabelEncoder

# The encoder learns its classes from the training labels only; the test
# labels are mapped with the same fitted encoder.
le = LabelEncoder()

# train_test_split returns row subsets of `data`. Writing a column into those
# subsets with `frame['label'] = ...` triggers pandas' SettingWithCopyWarning,
# so .assign() is used to build new frames carrying the encoded column.
# NOTE: run this cell once per split -- a second run would re-fit the encoder
# on the already-encoded integers instead of the original strings.
train_data = train_data.assign(label=le.fit_transform(train_data['label']))
test_data = test_data.assign(label=le.transform(test_data['label']))

In [61]:
# Vectorize the article text as TF-IDF features (a weighted bag-of-words
# representation), with the vocabulary capped at 5000 terms.

from sklearn.feature_extraction.text import TfidfVectorizer

# Targets: the label column of each split.
y_train, y_test = train_data['label'], test_data['label']

# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])

In [62]:
# Train the model: fit a logistic-regression classifier, created with its
# default settings, on the vectorized training text and the encoded labels.

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [88]:
# Look at one held-out row (its encoded label and its text) so the model's
# prediction for the same row can be checked against it.

test_data.iloc[700][['label','text']]

label                                                    0
text     Congress leader and member of Parliament Shash...
Name: 1092, dtype: object

In [91]:
# Position (not index label) of the test row to classify
sample_index = 700

# Pull the article text out of that row
sample_text = test_data['text'].iloc[sample_index]

# The text is wrapped in a list so the vectorizer receives a one-document batch
sample_vector = vectorizer.transform([sample_text])

# Numeric class id predicted for the single document
predicted_label_num = model.predict(sample_vector)[0]

# Map the numeric id back to the original label string
predicted_label = le.inverse_transform([predicted_label_num])[0]

print(
    f"Sample text: {sample_text}",
    f"Predicted label: {predicted_label}",
    sep="\n",
)

Sample text: Congress leader and member of Parliament Shashi Tharoor recently tweeted an old picture, showing a woman holding an intravenous tube connected to a child's forearm as he defecates, to take a dig at the Atmanirbhar Bharat Abhiyan (Self Reliant India Mission) policy.

BOOM found that the photograph shared by Tharoor was clicked in 2017 by Amitesh Kumar, assistant producer with The Lallantop, a news portal.

The term Atmanirbhar first caught social media attention after Prime Minister Narendra Modi used it while announcing the Coronavirus pandemic related economic package on May 12 amid the lockdown to stem the spread of the virus.

Also read Jhansi Police Turn Ghostbusters As Clipped Video Spooks Netizens

A Hindi caption with the viral tweet translates to 'We are experts in facing any disaster, this is self-reliant India, this is #selfreliant India'.

(Hindi: हर विपदा से लड़ने की, हम में बड़ी महारत है, ये …

In [92]:
import pickle 

In [94]:
# Save the fitted model, vectorizer and label encoder to disk as pickle files.

# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes; a bare open(...) passed straight to pickle.dump
# leaves the handle open.
for artifact, filename in (
    (model, 'model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
    (le, 'le.pkl'),
):
    with open(filename, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)