Reference: https://www.datacamp.com/community/tutorials/scikit-learn-fake-news

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import basic libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Read in CSV files
fake_df = pd.DataFrame(pd.read_csv('../Resources/Fake.csv'))
true_df = pd.DataFrame(pd.read_csv('../Resources/True.csv'))

# Add true/false column
true_df['category'] = 'TRUE'
fake_df['category'] = 'FALSE'

# Append the DFs
all_news_df = true_df.append(fake_df).reset_index()

# Print shape and display preview
all_news_df.shape
all_news_df.head()

Unnamed: 0,index,title,text,subject,date,category
0,0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [4]:
# ML libraries
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Drop the `label` column

X = all_news_df[['title','text']]
y = all_news_df[['category']]

In [12]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X.shape, y.shape)

(44898, 2) (44898, 1)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data 
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [11]:
# Get the feature names of `tfidf_vectorizer` 
print(tfidf_vectorizer.get_feature_names()[-10:])

# Get the feature names of `count_vectorizer` 
print(count_vectorizer.get_feature_names()[:10])

['text', 'title']
['text', 'title']
