## This is groups project for CAPP 30254: Machine Learning for Public Policy
## Done by: Big Brother Debunkers ##
**Team Member: Dingwei Liu, Qi Zhao**

**Topic: Fake News Detection**

In [1]:
# setting up the environment
import numpy as np # linear algebra
import pandas as pd # data processing
pd.set_option('display.max_colwidth', 100)
import re  # regular expression
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize

# TF-IDF (Term Frequency-Inverse Document Frequency) 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.svm import SVC # Support Vector Classifier (SVC) 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd # data processing


## Data Pre-processing and exploration

In [2]:
# Importing the dataset
data = pd.read_csv("train.csv", encoding = "UTF-8")

# check the first 5 rows of the dataset
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Luc...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart",Daniel J. Flynn,Ever get the feeling your life circles the roundabout rather than heads in a straight line towar...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, 2016 \nThe tension between intelligence analysts a...",1
3,3,15 Civilians Killed In Single US Airstrike Have Been Identified,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstrike Have Been Identified The rate at which civilia...,1
4,4,Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery,Howard Portnoy,Print \nAn Iranian woman has been sentenced to six years in prison after Iran’s Revolutionary Gu...,1


In [3]:
# explore the title column
print(data['title'].head(10))

0                      House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It
1                                                FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart
2                                                                      Why the Truth Might Get You Fired
3                                        15 Civilians Killed In Single US Airstrike Have Been Identified
4          Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery
5    Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (...
6    Life: Life Of Luxury: Elton John’s 6 Favorite Shark Pictures To Stare At During Long, Transconti...
7                Benoît Hamon Wins French Socialist Party’s Presidential Nomination - The New York Times
8    Excerpts From a Draft Script for Donald Trump’s Q&ampA With a Black Church’s Pastor - The New Yo...
9          A Back-Channel Plan for Ukraine and Russia, 

In [4]:
# Split the 'title' column on the hyphen and expand to multiple columns
split_data = data['title'].str.split(' - ', expand=True)

# Assign the first part back to the 'title' column (if needed)
data['title'] = split_data[0]

# Create the new 'source' column from the second part of the split
data['source'] = split_data[1]

# Display the DataFrame to check the new 'source' column
print(data[['title', 'source']].head(10))

                                                                                                 title  \
0                    House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It   
1                                                          FLYNN: Hillary Clinton, Big Woman on Campus   
2                                                                    Why the Truth Might Get You Fired   
3                                      15 Civilians Killed In Single US Airstrike Have Been Identified   
4        Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery   
5  Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (...   
6  Life: Life Of Luxury: Elton John’s 6 Favorite Shark Pictures To Stare At During Long, Transconti...   
7                                   Benoît Hamon Wins French Socialist Party’s Presidential Nomination   
8                  Excerpts From a Draft Scrip

In [5]:
# check the shape of the dataset
data.shape

(20800, 6)

In [6]:
# check whether there are missing values in the dataset
data.isnull().sum()

id            0
title       558
author     1957
text         39
label         0
source    11812
dtype: int64

In [7]:
# check the distribution of "label" in the dataset
data['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [8]:
# Get the top 5 authors
top_authors = data['author'].value_counts().head(5)

# Get the top 5 sources
top_sources = data['source'].value_counts().head(5)

# Print the results
print("Top 5 authors:")
print(top_authors)
print("\nTop 5 sources:")
print(top_sources)

Top 5 authors:
author
Pam Key             243
admin               193
Jerome Hudson       166
Charlie Spiering    141
John Hayward        140
Name: count, dtype: int64

Top 5 sources:
source
The New York Times    6222
Breitbart             2254
The Onion               76
Russia News Now         50
RT Arabic               15
Name: count, dtype: int64


In [9]:
# Masks for missing and non-missing values in 'author'
missing_author = data['author'].isnull()
not_missing_author = data['author'].notnull()

# Masks for missing and non-missing values in 'source'
missing_source = data['source'].isnull()
not_missing_source = data['source'].notnull()

# Distribution of 'label' for missing and non-missing 'author'
author_missing_label_dist = data.loc[missing_author, 'label'].value_counts()
author_not_missing_label_dist = data.loc[not_missing_author, 'label'].value_counts()

# Distribution of 'label' for missing and non-missing 'source'
source_missing_label_dist = data.loc[missing_source, 'label'].value_counts()
source_not_missing_label_dist = data.loc[not_missing_source, 'label'].value_counts()

# Print the results
print("Label distribution for missing 'author':")
print(author_missing_label_dist)
print("\nLabel distribution for non-missing 'author':")
print(author_not_missing_label_dist)

print("\nLabel distribution for missing 'source':")
print(source_missing_label_dist)
print("\nLabel distribution for non-missing 'source':")
print(source_not_missing_label_dist)

Label distribution for missing 'author':
label
1    1931
0      26
Name: count, dtype: int64

Label distribution for non-missing 'author':
label
0    10361
1     8482
Name: count, dtype: int64

Label distribution for missing 'source':
label
1    10004
0     1808
Name: count, dtype: int64

Label distribution for non-missing 'source':
label
0    8579
1     409
Name: count, dtype: int64


In [10]:
# delete the missing values in text column
data = data.dropna(subset=['title'])

# fill in missing values with empty strings
data = data.fillna('')

In [11]:
# delete the missing values in text column
data = data.dropna(subset=['title'])

# fill in missing values with empty strings
data = data.fillna('')

In [12]:
# merging the title, author name, and text columns
data['content'] = data['title'] + ' ' + data['author'] + ' ' + data['text']

## Stemming

In [13]:
port_stem = PorterStemmer() 

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower() 
    stemmed_content = stemmed_content.split() 
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] 
    stemmed_content = ' '.join(stemmed_content) 
    return stemmed_content

data['content'] = data['content'].apply(stemming)

data['content'].head()

In [26]:
words = [word for row in data['content'] for word in row.split()]

# count the frequency of each word
word_counts = Counter(words)

df_word_counts = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency'])

df_word_counts = df_word_counts.sort_values(by='Frequency', ascending=False)

print(df_word_counts.head(20))

     Word  Frequency
23    the     814841
42     to     421828
108    of     416548
117   and     363386
36      a     344519
25     in     281279
75   that     195537
46     is     150257
205   for     137180
18     on     122625
84    was      99669
145  with      97789
52    The      89583
71     as      89208
271    he      74646
621    by      74019
217  have      70727
622   are      70513
63     it      70100
89     be      69140


In [27]:
# Combine all texts into one large string
text = ' '.join(data['content'])

# Create a word cloud object
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes
plt.show()

KeyboardInterrupt: 

## Vectorizing (Textual data to numerical data)

In [29]:
data['content'] = data['content'].values
data['label'] = data['label'].values
vectorizer = TfidfVectorizer()
vectorizer.fit(data['content'])

X = vectorizer.transform(data['content'])
data['content'].head()

0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    FLYNN: Hillary Clinton, Big Woman on Campus  D...
2    Why the Truth Might Get You Fired Consortiumne...
3    15 Civilians Killed In Single US Airstrike Hav...
4    Iranian woman jailed for fictional unpublished...
Name: content, dtype: object

## Classification

In [30]:
# Get X and y values
X = X
y = data['label']

# 70% training and 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

# 20% testing and 10% validation
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=1/3, random_state=42)


In [31]:
classifier = SVC(kernel="linear")

In [38]:
print(X_train)

  (0, 5404)	0.06715358082131856
  (0, 6617)	0.026078384500183523
  (0, 6662)	0.03307861290125485
  (0, 8102)	0.024829178667655646
  (0, 9315)	0.02091729748400212
  (0, 9504)	0.10275763950401619
  (0, 12422)	0.039094912504131434
  (0, 12867)	0.1156161514810952
  (0, 14910)	0.034457321869708755
  (0, 17020)	0.040698598120079914
  (0, 17119)	0.08384045298875266
  (0, 17341)	0.023907070392065057
  (0, 17385)	0.03142931184393018
  (0, 17416)	0.047427455411629585
  (0, 18755)	0.03692701196536229
  (0, 22372)	0.06261472979482217
  (0, 24399)	0.019640327857536128
  (0, 25238)	0.0388615282984522
  (0, 26391)	0.0427742331942849
  (0, 28590)	0.04734424228897373
  (0, 30335)	0.11613571239636801
  (0, 31605)	0.06866039963868291
  (0, 32214)	0.09164732091506263
  (0, 37598)	0.10010791715277705
  (0, 39473)	0.06614482915747129
  :	:
  (13493, 147336)	0.03119479006199519
  (13493, 147571)	0.05321207241517188
  (13493, 147979)	0.014396736873992563
  (13493, 148269)	0.03719644810959659
  (13493, 148379)

In [39]:
# classifier.fit(X_train, y_train)
classifier.fit(X_train, y_train)

AttributeError: 'csr_matrix' object has no attribute 'values'

In [33]:
X_predict = classifier.predict(X_test)

In [34]:
X_predict

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

In [35]:
# calculate the accuracy
accuracy = accuracy_score(y_test, X_predict)

print("Accuracy:", accuracy)

Accuracy: 0.9657625611382836


In [36]:
# calculate f1-score
f1 = f1_score(y_test, X_predict)

print("F1-Score:", f1)

F1-Score: 0.9648882808937529


In [40]:
from sklearn.linear_model import LogisticRegression
model =  LogisticRegression()  # 建模
model.fit(X_train,  y_train)  # 拟合
y_pred =  model.predict(X_test)  # 测试

In [41]:
accuracy_score(y_test,  y_pred)

0.9486438417074256