In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [4]:
#Loading training dataset 
df = pd.read_csv('./data/liar-fake-news-dataset/train.tsv', sep='\t')
df.head()


Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece


In [5]:
df.describe()

Unnamed: 0,0,1,0.1,0.2,0.3
count,10237.0,10237.0,10237.0,10237.0,10237.0
mean,11.534336,13.287682,17.135391,16.43587,6.202012
std,18.974349,24.113808,35.847862,36.153089,16.129599
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,2.0,2.0,3.0,3.0,1.0
75%,12.0,12.0,13.0,11.0,5.0
max,70.0,114.0,160.0,163.0,105.0


In [6]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10239 entries, 0 to 10238
Data columns (total 14 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   2635.json                                                                           10239 non-null  object 
 1   false                                                                               10239 non-null  object 
 2   Says the Annies List political group supports third-trimester abortions on demand.  10239 non-null  object 
 3   abortion                                                                            10237 non-null  object 
 4   dwayne-bohac                                                                        10237 non-null  object 
 5   State representative                                                                7341 non-nu

In [7]:
#Dropping NaN values
df.dropna(inplace=True)  # Or df.fillna('Unknown', inplace=True)


In [8]:
#Adding headers for columns 
column_headers = [
    'id', 'label', 'statement', 'subjects', 'speaker', 'speaker_job',
    'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'
]

df = pd.read_csv('./data/liar-fake-news-dataset/test.tsv', delimiter='\t', names=column_headers, header=None)


In [9]:
import spacy
import re

# Load SpaCy's English-language model
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    #replacing non-word characters with a space (special characters etc...)
    text = re.sub(r'\W', ' ', str(text))

    #converting all characters to lower case for uniformity
    text = text.lower()

    #removing single characters
    text = re.sub(r'\s+[a-z]\s+', ' ', text)

    #removing single characters from the start of the text 
    text = re.sub(r'^[a-z]\s+', ' ', text)

    #reducing multiple spaces to 1 single space
    text = re.sub(r'\s+', ' ', text)
    
    #Parcing the text
    doc = nlp(text)
    
    #tokenizing text into words
    processed_text = ' '.join(token.lemma_ for token in doc if not token.is_stop and not token.is_space)
    return processed_text

#Applying the preprocess function to the "Statement" column
df['cleaned_text'] = df['statement'].apply(preprocess_text)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267 entries, 0 to 1266
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    1267 non-null   object
 1   label                 1267 non-null   object
 2   statement             1267 non-null   object
 3   subjects              1267 non-null   object
 4   speaker               1267 non-null   object
 5   speaker_job           942 non-null    object
 6   state_info            1005 non-null   object
 7   party_affiliation     1267 non-null   object
 8   barely_true_counts    1267 non-null   int64 
 9   false_counts          1267 non-null   int64 
 10  half_true_counts      1267 non-null   int64 
 11  mostly_true_counts    1267 non-null   int64 
 12  pants_on_fire_counts  1267 non-null   int64 
 13  context               1250 non-null   object
 14  cleaned_text          1267 non-null   object
dtypes: int64(5), object(10)
memory usage: 

In [11]:
df.describe()


Unnamed: 0,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
count,1267.0,1267.0,1267.0,1267.0,1267.0
mean,11.770324,13.465667,17.566693,16.912391,6.016575
std,18.980496,23.956307,36.10929,36.499879,15.059981
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,1.0,0.0
50%,3.0,3.0,3.0,3.0,1.0
75%,12.0,17.0,15.0,14.0,6.0
max,70.0,114.0,160.0,163.0,105.0


In [12]:
df.shape

(1267, 15)

In [15]:
#Addt'l preprocessing steps 
#Expanding Contractions

import re

contractions_dict = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

    
    

SyntaxError: unterminated string literal (detected at line 7) (426214553.py, line 7)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

#converting the preprocessed text into numerical format for model to comprehend
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['cleaned_text'])
