Import the necessary libraries

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [21]:
## Load the news CSV into a DataFrame and preview the first five rows
dataframe = pd.read_csv(filepath_or_buffer='news.csv')
dataframe.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [22]:
## Separate the article body (features, x) from the FAKE/REAL label (target, y)
x, y = dataframe['text'], dataframe['label']
# One print call with a newline separator emits x and then y, each on its own lines.
print(x, y, sep='\n')

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object
0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object


In [23]:
# Hold out 20% of the articles for evaluation; random_state pins the shuffle
# so the same rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [24]:
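## Now we need to fit the TF-IDF vectorizer.
# A float max_df is a proportion: max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
# An integer max_df is an absolute count: max_df = 25 means "ignore terms that appear in more than 25 documents".
# This notebook uses max_df = 0.7, i.e. terms found in more than 70% of the documents are dropped.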


In [25]:
# Build the TF-IDF vectorizer: remove English stop words and ignore any term
# that occurs in more than 70% of the documents.
tfvect = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

In [26]:
# Learn the vocabulary and IDF weights from the training texts only, then vectorize them.
tfid_x_train = tfvect.fit_transform(raw_documents=x_train)

In [27]:
# Vectorize the test texts with the already-fitted vectorizer (transform only, no refit).
tfid_x_test = tfvect.transform(raw_documents=x_test)


In [28]:
## Now let's fit the Machine Learning Model
# PassiveAggressiveClassifier shuffles the training data between epochs by default,
# so without a fixed seed the learned weights -- and therefore the accuracy and
# confusion matrix -- differ from one run to the next. Pinning random_state makes
# "Restart Kernel -> Run All" reproduce the same model.
classifier = PassiveAggressiveClassifier(max_iter=50, random_state=0)

In [29]:
# Train on the TF-IDF features of the training split; the fitted estimator is
# the cell's last expression, so its repr is displayed.
classifier.fit(X=tfid_x_train, y=y_train)

PassiveAggressiveClassifier(max_iter=50)

In [30]:
## Now let's check the model's accuracy by evaluating it on the held-out test data.


In [31]:
# Predict a label for every article in the test split.
y_pred = classifier.predict(X=tfid_x_test)

In [32]:
# Fraction of test articles whose predicted label matches the true label.
score = accuracy_score(y_test, y_pred)
# Report the metric: previously it was computed but never shown anywhere.
print(f'Accuracy: {round(score * 100, 2)}%')

In [33]:
# Confusion matrix with a fixed label order so row/column 0 is FAKE and 1 is REAL.
cf = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['FAKE', 'REAL'],
)

In [34]:
# Rows are the true labels and columns the predicted labels, both ordered ['FAKE', 'REAL'].
print(cf)

# Output recorded from an earlier run; the counts may not match the current run exactly:
# [[575  40]
# [ 40 612]]

[[571  44]
 [ 40 612]]


In [35]:
## Let's create a function to test the model on new, previously unseen text.

def fake_news_det(news):
    """Classify a single news text and print the predicted label.

    Relies on the notebook-level ``tfvect`` (fitted vectorizer) and
    ``classifier`` (trained model).

    Args:
        news: The article text to classify.

    Returns:
        None. The prediction array is printed to stdout.
    """
    # The vectorizer works on a collection of documents, so wrap the single text in a list.
    features = tfvect.transform([news])
    print(classifier.predict(features))

In [36]:
# Smoke-test the helper on a single article excerpt.
# NOTE(review): "sheâ€™s" looks like mis-decoded UTF-8 for "she’s". It is left
# untouched because the string is model input -- confirm whether it is meant to
# mirror the encoding of the training data.
fake_news_det("""Go to Article President Barack Obama has been 
campaigning hard for the woman who is supposedly going to extend his legacy 
four more years. The only problem with stumping for Hillary Clinton, however, 
is sheâ€™s not exactly a candidate easy to get too enthused about.  """)
# Expected output: ['FAKE']

['FAKE']


In [37]:
# Let's save the model into pickle file so that we can use it in flask app.
import pickle


In [38]:
# Serialize the trained classifier to model.pkl. The context manager ensures the
# file handle is flushed and closed even if pickling raises; the original
# open(...) call left the handle open.
# NOTE(review): only the classifier is saved here. The fitted `tfvect` vectorizer
# is presumably also needed to transform incoming text at inference time --
# confirm it is persisted elsewhere.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)