In [2]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [3]:
args = Namespace(
    
    raw_train_dataset_csv="data/fake_news/train.csv",
    raw_test_dataset_csv="data/fake_news/test.csv",
    proportion_subset_of_train=1,
    proportion_subset_of_test=1,
    train_proportion=0.8,
    val_proportion=0.2,
    output_train_val_csv="data/fake_news/train_val_preprocessed_FULL.csv",
    output_munged_csv="data/fake_news/preprocessed_FULL.csv",
    output_test_csv="data/fake_news/test_preprocessed.csv",
    seed=42
    
)

In [4]:
train_news = pd.read_csv(args.raw_train_dataset_csv, index_col='id')
test_news = pd.read_csv(args.raw_test_dataset_csv, index_col='id')

In [5]:
train_news.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Train on a subset of the full train dataset

In [6]:
# group by label 
by_label = train_news.groupby('label')

# a dataframe to store the subset of the train dataset
news_subset = pd.DataFrame(columns = train_news.columns)

for name, group in by_label:
    # sample a fraction of each group(by label) and append to news_subset
    news_subset = pd.concat([news_subset, group.sample(frac=args.proportion_subset_of_train)])

news_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20800 entries, 12007 to 16673
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  object
dtypes: object(4)
memory usage: 812.5+ KB


## Train dataset and Validation dataset split

In [7]:
# group by label 
by_label = news_subset.groupby('label')

np.random.seed(args.seed)

columns = list(news_subset.columns)
train_news_final = pd.DataFrame(columns = columns.append('split'))

for name, group in by_label:
    n_total = group.shape[0]
    n_train = int(args.train_proportion * n_total)
    
    # shuffle rows 
    group = group.sample(frac=1)
    
    # split 
    train, val = np.split(group, [n_train])
    train['split'] = 'train'
    val['split'] = 'val'
    
    train_news_final = pd.concat([train_news_final, train, val])
    

train_news_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20800 entries, 19853 to 8742
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  object
 4   split   20800 non-null  object
dtypes: object(5)
memory usage: 975.0+ KB


## Preprocessing 

In [8]:
# Preprocess
def preprocess_text(text):
    text = str(text).lower()
    
    # match a single character present in the group and add space before and after the character
    # (\1 - refers to the matched character)
    text = re.sub(r"([.,!?])", r" \1 ", text)
    
    # match a single character not present in the group and replace it with space
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
train_news_final.text = train_news_final.text.apply(preprocess_text)
train_news_final.title = train_news_final.title.apply(preprocess_text)

In [9]:
test_news.text = test_news.text.apply(preprocess_text)
test_news.title = test_news.title.apply(preprocess_text)
test_news['split'] = 'test'

test_news_final = test_news.sample(frac=args.proportion_subset_of_test)

news_final = pd.concat([train_news_final, test_news_final])

In [10]:
news_final.groupby('split').count()

Unnamed: 0_level_0,title,author,text,label
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,5200,4697,5200,0
train,16639,15074,16639,16639
val,4161,3769,4161,4161


In [11]:
news_final.sample(10)

Unnamed: 0,title,author,text,label,split
20718,this is the best picture in human history dail...,,this is the best picture in human history by n...,1.0,train
3611,"caroline kennedy dances in a christmas video ,...",Mike Ives,hong kong the japanese public pays careful att...,0.0,train
12462,shock and gore mexican matador gets bull s hor...,Warner Todd Huston,bull fighting fans in mexico were shocked this...,0.0,val
1058,gatlinburg residents return home to wildfire d...,Richard Fausset,"gatlinburg , tenn . the annual fantasy of ligh...",0.0,train
24696,the ancient gods releasing matrix control,Gillian,leave a reply david manning our evolution is s...,,test
23999,flynn is said to have talked to russians about...,Matthew Rosenberg and Matt Apuzzo,washington weeks before president trump s inau...,,test
23604,breaking obama now considering martial law bec...,,"email print in every competition out there , t...",,test
15981,protesters disturb ryan lochte s debut on danc...,Jonah Engel Bromwich,"the olympic swimmer ryan lochte , making his d...",0.0,val
22925,michael hudson on meet the renegades,Yves Smith,lambert strether on pm water cooler i don t kn...,,test
6633,i m so tired of all these psychotics babbling ...,Craazee8,i m so tired of all these psychotics babbling ...,1.0,train


In [12]:
# train_news_final.to_csv(args.output_munged_csv, index=False)
# test_news.to_csv(args.output_test_csv, index=False)
news_final.to_csv(args.output_munged_csv)

In [13]:
news_final.dtypes

title     object
author    object
text      object
label     object
split     object
dtype: object