In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('../liar_dataset/test.tsv', sep='\t', header=None, usecols=[1,2])
data = data.rename(columns={1: 'type', 2: 'content'})
data

Unnamed: 0,type,content
0,true,Building a wall on the U.S.-Mexico border will...
1,false,Wisconsin is on pace to double the number of l...
2,false,Says John McCain has done nothing to help the ...
3,half-true,Suzanne Bonamici supports a plan that will cut...
4,pants-fire,When asked by a reporter whether hes at the ce...
...,...,...
1262,half-true,Says his budget provides the highest state fun...
1263,barely-true,Ive been here almost every day.
1264,barely-true,"In the early 1980s, Sen. Edward Kennedy secret..."
1265,barely-true,Says an EPA permit languished under Strickland...


In [4]:
list(set(data['type']))

['true', 'barely-true', 'pants-fire', 'false', 'mostly-true', 'half-true']

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pandas as pd
import re

nltk.download('stopwords')
nltk.download('punkt')
stopwords = stopwords.words('english')

def full_clean(text: str, stopwords=stopwords):
    text = text.lower()

    text = re.sub(r'\n', ' ', text) # Remove newlines
    text = re.sub(r' +', ' ', text) # Remove multiple spaces

    text = re.sub(r'([a-zA-Z]+) (\d+)[, ]? (\d{4})', '<DATE>', text) # Date substitution
    text = re.sub(r'([.a-zA-Z0-9]+)@([-a-zA-Z0-9]+).([a-zA-Z]+)', '<EMAIL>', text) # E-Mail substitution
    text = re.sub(r'(https?:\/\/)?(www.)?([-.a-zA-Z0-9]+)[.](co.uk|com|org|net)\/?([\%\-\.\?\_=a-zA-Z0-9\/]+)?', '<URL>', text) # URL substitution
    text = re.sub(r'[0-9]+', '<NUM>', text) # Number substitution

    stemmer = PorterStemmer()                                   # Porter Stemmer from nltk
    tokens = nltk.word_tokenize(text)                           # Tokenizing the text
    tokens = [word for word in tokens if word.isalpha()]        # Removing punctuation
    tokens = [word for word in tokens if word not in stopwords] # Removing Stopwords
    tokens = [stemmer.stem(word) for word in tokens]            # Stemming all the words
    return ' '.join(tokens) # Returning a string consisting of each word in the list

data['content'] = data['content'].apply(full_clean)

def is_credible(article_type):
    if article_type in ['false', 'pants-fire', 'barely-true']:
        return 0
    elif article_type in ['half-true', 'mostly-true', 'true']:
        return 1
    
data['type'] = data['type'].apply(is_credible)

print(data)

# Store Results

# Save Cleaned Data
# df.to_csv("clean_liar_test.csv", index=False)

# Display Sample Results

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


      type                                            content
0        1                  build wall border take liter year
1        0            wisconsin pace doubl number layoff year
2        0                 say john mccain done noth help vet
3        1  suzann bonamici support plan cut choic medicar...
4        0  ask report whether center crimin scheme violat...
...    ...                                                ...
1262     1  say budget provid highest state fund level his...
1263     0                               ive almost everi day
1264     0  earli num edward kennedi secretli offer help s...
1265     0  say epa permit languish strickland new epa dir...
1266     0  say governor go around state talk fund incom t...

[1267 rows x 2 columns]
