In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import re

In [2]:
# Load the training articles from train.csv (path is relative to the working
# directory) and preview the first rows.
df=pd.read_csv('train.csv')
df.head()


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Summary statistics for the numeric columns of the training frame.
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [4]:
# Element-wise missing-value mask for every cell of the training frame.
# NOTE(review): the truncated preview cannot show where the gaps are; a
# per-column count such as df.isnull().sum() would be more informative.
df.isnull()

Unnamed: 0,id,title,author,text,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
20795,False,False,False,False,False
20796,False,False,False,False,False
20797,False,False,False,False,False
20798,False,False,False,False,False


In [5]:
# Drop the training row whose index label is 16921 before building features.
# NOTE(review): the reason this particular row is excluded is not recorded in
# the notebook — document it or derive the label programmatically.
# errors="ignore" keeps the cell idempotent: without it, running the cell a
# second time raises KeyError because the label has already been removed.
df = df.drop(index=[16921], errors="ignore")

In [6]:
# Select the model input and the target by column name rather than by position,
# so a change in the CSV column order cannot silently pick the wrong column.
features = df["text"].values
labels = df["label"].values

In [7]:
# Inspect the raw article texts selected for training.
features

array(['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewi

In [8]:
processed_features = []

# Keep these cleaning steps identical to the ones applied to the test.csv
# texts so that training and prediction inputs are normalised the same way.
for raw_text in features:
    # A missing article (NaN) would otherwise be stringified to the literal
    # token "nan"; treat it as an empty document instead.
    processed_feature = '' if pd.isna(raw_text) else str(raw_text)

    # Replace every non-word character (punctuation, symbols) with a space
    processed_feature = re.sub(r'\W', ' ', processed_feature)

    # Remove single letters that are surrounded by whitespace
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove a single letter at the very start of the text. The previous
    # pattern escaped the caret (r'\^...'), so it looked for a literal '^'
    # character, which can never be present after the \W substitution above.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', '', processed_feature)

    # Collapse runs of whitespace into a single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [9]:
# Preview only the first few cleaned documents: a plain list is not truncated
# on display, so showing all of it renders every article in full and bloats
# the saved notebook.
processed_features[:3]

['house dem aide we didn even see comey letter until jason chaffetz tweeted it by darrell lucus on october 30 2016 subscribe jason chaffetz on the stump in american fork utah image courtesy michael jolley available under creative commons by license with apologies to keith olbermann there is no doubt who the worst person in the world is this week fbi director james comey but according to house democratic aide it looks like we also know who the second worst person is as well it turns out that when comey sent his now infamous letter announcing that the fbi was looking into emails that may be related to hillary clinton email server the ranking democrats on the relevant committees didn hear about it from comey they found out via tweet from one of the republican committee chairmen as we now know comey notified the republican chairmen and democratic ranking members of the house intelligence judiciary and oversight committees that his agency was reviewing emails it had recently discovered in o

In [10]:
# Inspect the target array that pairs with the training texts.
labels

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [11]:
# Wrap the cleaned training texts in a one-column frame named "texts".
featur = pd.DataFrame({"texts": processed_features})

In [12]:
# Preview the cleaned training texts.
featur

Unnamed: 0,texts
0,house dem aide we didn even see comey letter u...
1,ever get the feeling your life circles the rou...
2,why the truth might get you fired october 29 2...
3,videos 15 civilians killed in single us airstr...
4,print an iranian woman has been sentenced to s...
...,...
20794,rapper i unloaded on black celebrities who met...
20795,when the green bay packers lost to the washing...
20796,the macy of today grew from the union of sever...
20797,nato russia to hold parallel exercises in balk...


In [13]:
# Wrap the training targets in a one-column frame named "labels".
lab = pd.DataFrame({"labels": labels})

In [14]:
# Preview the training targets.
lab

Unnamed: 0,labels
0,1
1,0
2,1
3,1
4,1
...,...
20794,0
20795,0
20796,0
20797,1


In [15]:
# Hold out 20% of the cleaned articles for evaluation; the fixed seed makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    featur["texts"],
    lab["labels"],
    test_size=0.2,
    random_state=7,
)

In [16]:
# Preview the training portion of the split.
x_train

14106    november 21 2016 fort russ news rianovosti tra...
13403    the leads have dried up in the killing of youn...
14676    on the monday edition of breitbart news daily ...
609      the international community should not only ac...
5997     disgraced former new york congressman anthony ...
                               ...                        
919      andrew anglin 26 2016 what blacks think black ...
20691     reuters russian president vladimir putin has ...
5699     the family of an illegal immigrant arrested du...
10742    rio de janeiro the highest court for world spo...
16921    am missing something here take it to be saying...
Name: texts, Length: 16639, dtype: object

In [17]:
# TF-IDF settings: cap the vocabulary at 2500 terms, drop English stop words,
# and ignore terms that occur in more than 70% of the documents.
tf_v = TfidfVectorizer(
    max_features=2500,
    stop_words='english',
    max_df=0.7,
)

In [18]:
# Fit the TF-IDF vocabulary on the training split only, then reuse the fitted
# vectorizer for the hold-out split so the hold-out texts do not influence it.
tf_train=tf_v.fit_transform(x_train)
tf_test=tf_v.transform(x_test)

In [19]:
# Shape of the training matrix: (documents, vocabulary terms).
tf_train.shape

(16639, 2500)

In [20]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features and
# report accuracy on the 20% hold-out split.
# random_state pins the shuffling of the training data between epochs so the
# score and the confusion matrix are reproducible across kernel restarts.
pac=PassiveAggressiveClassifier(max_iter=50,random_state=7,verbose=1)
pac.fit(tf_train,y_train)
y_pred=pac.predict(tf_test)
score=accuracy_score(y_test,y_pred)
print(score)

-- Epoch 1
Norm: 48.31, NNZs: 2500, Bias: 1.026535, T: 16519, Avg. loss: 0.210328
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 59.40, NNZs: 2500, Bias: 1.300425, T: 33038, Avg. loss: 0.119588
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 67.86, NNZs: 2500, Bias: 1.416745, T: 49557, Avg. loss: 0.097765
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 74.70, NNZs: 2500, Bias: 1.633066, T: 66076, Avg. loss: 0.087357
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 80.98, NNZs: 2500, Bias: 1.628434, T: 82595, Avg. loss: 0.077491
Total training time: 0.06 seconds.
-- Epoch 6
Norm: 86.28, NNZs: 2500, Bias: 1.696590, T: 99114, Avg. loss: 0.070603
Total training time: 0.07 seconds.
-- Epoch 7
Norm: 91.13, NNZs: 2500, Bias: 1.766160, T: 115633, Avg. loss: 0.066560
Total training time: 0.08 seconds.
-- Epoch 8
Norm: 95.84, NNZs: 2500, Bias: 1.781072, T: 132152, Avg. loss: 0.062512
Total training time: 0.09 seconds.
-- Epoch 9
Norm: 99.88, NNZs: 2500, Bias: 1.915908, T: 148671,

In [21]:
# Confusion matrix for the hold-out split (rows: true class, columns:
# predicted class); labels=[1,0] places class 1 first on both axes.
confusion_matrix(y_test,y_pred,labels=[1,0])


array([[1946,  145],
       [ 163, 1906]], dtype=int64)

In [22]:
# Load the articles from test.csv and preview the first rows.
test_data=pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [23]:
# Load submit.csv, whose labels are later compared with the predictions for
# test.csv.
# NOTE(review): confirm that submit.csv holds real ground-truth labels and is
# not just a submission-format template.
validate_data=pd.read_csv('submit.csv')
validate_data.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [24]:
# Summary statistics for the numeric columns of the test frame.
test_data.describe()

Unnamed: 0,id
count,5200.0
mean,23399.5
std,1501.255031
min,20800.0
25%,22099.75
50%,23399.5
75%,24699.25
max,25999.0


In [25]:
# Select by column name rather than by position so a change in the CSV column
# order cannot silently pick the wrong column.
features_test = test_data["text"].values
# NOTE(review): confirm that the labels in submit.csv are real ground truth
# before trusting any score computed against them.
labels_test = validate_data["label"].values

In [26]:
# Inspect the raw article texts from test.csv.
features_test

array(['PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists. A distinguished venture capital firm emblazoned on its corporate home page an earthy   epithet. One prominent tech chieftain says the consequences of Mr. Trump’s election would “range between disastrous and terrible. ” Another compares him to a dictator. And nearly 150 tech leaders signed an open letter decrying Mr. Trump and his campaign of “anger” and “bigotry. ” Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave

In [27]:
# Inspect the labels read from submit.csv.
labels_test

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [28]:
processed_features_test = []

# Keep these cleaning steps identical to the ones applied to the training
# texts so that training and prediction inputs are normalised the same way.
for raw_text in features_test:
    # A missing article (NaN) would otherwise be stringified to the literal
    # token "nan"; treat it as an empty document instead.
    processed_feature = '' if pd.isna(raw_text) else str(raw_text)

    # Replace every non-word character (punctuation, symbols) with a space
    processed_feature = re.sub(r'\W', ' ', processed_feature)

    # Remove single letters that are surrounded by whitespace
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove a single letter at the very start of the text. The previous
    # pattern escaped the caret (r'\^...'), so it looked for a literal '^'
    # character, which can never be present after the \W substitution above.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', '', processed_feature)

    # Collapse runs of whitespace into a single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to lowercase
    processed_feature = processed_feature.lower()

    processed_features_test.append(processed_feature)

In [29]:
# Preview only the first few cleaned documents: a plain list is not truncated
# on display, so showing all of it renders every article in full and bloats
# the saved notebook.
processed_features_test[:3]

['palo alto calif after years of scorning the political process silicon valley has leapt into the fray the prospect of president donald trump is pushing the tech community to move beyond its traditional role as donors and to embrace new existence as agitators and activists distinguished venture capital firm emblazoned on its corporate home page an earthy epithet one prominent tech chieftain says the consequences of mr trump election would range between disastrous and terrible another compares him to dictator and nearly 150 tech leaders signed an open letter decrying mr trump and his campaign of anger and bigotry not quite all the action is peter thiel founder of paypal and palantir who was the first outside investor in facebook spoke at the republican convention in july the new york times reported on saturday that mr thiel is giving 1 25 million to support mr trump candidacy even as other supporters flee he also recently gave 1 million to super pac that supports senator rob portman the

In [30]:
# Wrap the cleaned test.csv texts in a one-column frame named "texts".
featur_test = pd.DataFrame({"texts": processed_features_test})

In [31]:
# Preview the cleaned test texts.
featur_test

Unnamed: 0,texts
0,palo alto calif after years of scorning the po...
1,russian warships ready to strike terrorists ne...
2,videos nodapl native american leaders vow to s...
3,if at first you don succeed try different spor...
4,42 mins ago 1 views 0 comments 0 likes for the...
...,...
5195,of all the dysfunctions that plague the world ...
5196,washington gov john kasich of ohio on tuesday ...
5197,good morning want to get california today by e...
5198,previous next 300 us marines to be deployed t...


In [32]:
# Wrap the labels read from submit.csv in a one-column frame named "labels_test".
lab_test = pd.DataFrame({"labels_test": labels_test})

In [33]:
# Preview the labels read from submit.csv.
lab_test

Unnamed: 0,labels_test
0,0
1,1
2,0
3,1
4,1
...,...
5195,0
5196,1
5197,0
5198,1


In [34]:

# Vectorise the cleaned test.csv texts with the already-fitted TF-IDF
# vectorizer (transform only, no refit).
tf_test_val=tf_v.transform(featur_test['texts'])

In [35]:
# Predict on the vectorised test.csv articles and compare the predictions with
# the labels read from submit.csv.
# NOTE(review): `score` overwrites the hold-out accuracy computed in the
# training cell, and this figure is only meaningful if submit.csv holds true
# labels — confirm.
y_pred_val=pac.predict(tf_test_val)
score=accuracy_score(labels_test,y_pred_val)
print(score)

0.6303846153846154


In [36]:
# Confusion matrix of the submit.csv labels against the test.csv predictions;
# labels=[1,0] places class 1 first on both axes.
confusion_matrix(lab_test['labels_test'],y_pred_val,labels=[1,0])

array([[1770, 1091],
       [ 831, 1508]], dtype=int64)

In [37]:
# Inspect the predicted classes for the test.csv articles.
y_pred_val

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [38]:
# Attach the submit.csv labels next to each cleaned test text. This adds a
# column to featur_test in place, so earlier previews of the frame are stale.
featur_test['labels']=lab_test.values

In [39]:
# Preview the test texts together with the attached labels.
featur_test

Unnamed: 0,texts,labels
0,palo alto calif after years of scorning the po...,0
1,russian warships ready to strike terrorists ne...,1
2,videos nodapl native american leaders vow to s...,0
3,if at first you don succeed try different spor...,1
4,42 mins ago 1 views 0 comments 0 likes for the...,1
...,...,...
5195,of all the dysfunctions that plague the world ...,0
5196,washington gov john kasich of ohio on tuesday ...,1
5197,good morning want to get california today by e...,0
5198,previous next 300 us marines to be deployed t...,1


In [40]:
# Attach the model predictions as a further column of featur_test (in place).
featur_test['predicted']=y_pred_val

In [41]:
# Preview texts, submit.csv labels and predictions side by side.
featur_test

Unnamed: 0,texts,labels,predicted
0,palo alto calif after years of scorning the po...,0,0
1,russian warships ready to strike terrorists ne...,1,1
2,videos nodapl native american leaders vow to s...,0,1
3,if at first you don succeed try different spor...,1,0
4,42 mins ago 1 views 0 comments 0 likes for the...,1,1
...,...,...,...
5195,of all the dysfunctions that plague the world ...,0,0
5196,washington gov john kasich of ohio on tuesday ...,1,0
5197,good morning want to get california today by e...,0,0
5198,previous next 300 us marines to be deployed t...,1,1


In [42]:
# Cap DataFrame previews at 20 rows (assign None instead to show every row).
pd.options.display.max_rows = 20

In [43]:
# Display the frame again under the display.max_rows setting applied above.
featur_test

Unnamed: 0,texts,labels,predicted
0,palo alto calif after years of scorning the po...,0,0
1,russian warships ready to strike terrorists ne...,1,1
2,videos nodapl native american leaders vow to s...,0,1
3,if at first you don succeed try different spor...,1,0
4,42 mins ago 1 views 0 comments 0 likes for the...,1,1
...,...,...,...
5195,of all the dysfunctions that plague the world ...,0,0
5196,washington gov john kasich of ohio on tuesday ...,1,0
5197,good morning want to get california today by e...,0,0
5198,previous next 300 us marines to be deployed t...,1,1


In [44]:
# Inspect the submit.csv labels as a Series.
lab_test['labels_test']

0       0
1       1
2       0
3       1
4       1
       ..
5195    0
5196    1
5197    0
5198    1
5199    0
Name: labels_test, Length: 5200, dtype: int64

In [45]:
# Start the submission frame from the first column of submit.csv (the ids).
submit_file = pd.DataFrame({"id": validate_data.iloc[:, 0].values})

In [46]:
# Preview the submission frame (ids only at this point).
submit_file

Unnamed: 0,id
0,20800
1,20801
2,20802
3,20803
4,20804
...,...
5195,25995
5196,25996
5197,25997
5198,25998


In [47]:
# Take the predictions by column name. Position 2 is only the 'predicted'
# column when the 'labels' column was added to featur_test first, so positional
# access depends on the order in which the earlier cells were run.
submit_file["label"]=featur_test["predicted"].values

In [48]:
# Preview the finished submission frame (id and predicted label).
submit_file

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5195,25995,0
5196,25996,0
5197,25997,0
5198,25998,1


In [51]:
# Write the id/label pairs to disk; index=False leaves out the DataFrame index.
submit_file.to_csv('fake_new_submit.csv',index=False) 