In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [2]:
args = Namespace(
    
    raw_train_dataset_csv="data/fake_news/train.csv",
    raw_test_dataset_csv="data/fake_news/test.csv",
    proportion_subset_of_train=0.1,
    proportion_subset_of_test=0.1,
    train_proportion=0.8,
    val_proportion=0.2,
    output_train_val_csv="data/fake_news/train_val_preprocessed_LITE.csv",
    output_munged_csv="data/fake_news/preprocessed_LITE.csv",
    output_test_csv="data/fake_news/test_preprocessed.csv",
    seed=42
    
)

In [3]:
train_news = pd.read_csv(args.raw_train_dataset_csv, index_col='id')
test_news = pd.read_csv(args.raw_test_dataset_csv, index_col='id')

In [4]:
train_news.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Train on a subset of the full train dataset

In [5]:
# group by label 
by_label = train_news.groupby('label')

# a dataframe to store the subset of the train dataset
news_subset = pd.DataFrame(columns = train_news.columns)

for name, group in by_label:
    # sample a fraction of each group(by label) and append to news_subset
    news_subset = pd.concat([news_subset, group.sample(frac=args.proportion_subset_of_train)])

news_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2080 entries, 8008 to 15528
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   2031 non-null   object
 1   author  1885 non-null   object
 2   text    2077 non-null   object
 3   label   2080 non-null   object
dtypes: object(4)
memory usage: 81.2+ KB


## Train dataset and Validation dataset split

In [6]:
# group by label 
by_label = news_subset.groupby('label')

np.random.seed(args.seed)

columns = list(news_subset.columns)
train_news_final = pd.DataFrame(columns = columns.append('split'))

for name, group in by_label:
    n_total = group.shape[0]
    n_train = int(args.train_proportion * n_total)
    
    # shuffle rows 
    group = group.sample(frac=1)
    
    # split 
    train, val = np.split(group, [n_train])
    train['split'] = 'train'
    val['split'] = 'val'
    
    train_news_final = pd.concat([train_news_final, train, val])
    

train_news_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2080 entries, 7247 to 10907
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   2031 non-null   object
 1   author  1885 non-null   object
 2   text    2077 non-null   object
 3   label   2080 non-null   object
 4   split   2080 non-null   object
dtypes: object(5)
memory usage: 97.5+ KB


## Preprocessing 

In [7]:
# Preprocess
def preprocess_text(text):
    text = str(text).lower()
    
    # match a single character present in the group and add space before and after the character
    # (\1 - refers to the matched character)
    text = re.sub(r"([.,!?])", r" \1 ", text)
    
    # match a single character not present in the group and replace it with space
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
train_news_final.text = train_news_final.text.apply(preprocess_text)
train_news_final.title = train_news_final.title.apply(preprocess_text)

In [8]:
test_news.text = test_news.text.apply(preprocess_text)
test_news.title = test_news.title.apply(preprocess_text)
test_news['split'] = 'test'

test_news_final = test_news.sample(frac=args.proportion_subset_of_test)

news_final = pd.concat([train_news_final, test_news_final])

In [9]:
news_final.groupby('split').count()

Unnamed: 0_level_0,title,author,text,label
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,520,459,520,0
train,1663,1510,1663,1663
val,417,375,417,417


In [12]:
news_final.sample(10)

Unnamed: 0,title,author,text,label,split
529,comment on gold medalist wrestler gets violent...,Buck Rogers,"usa created isis , all european know about tha...",1.0,val
15879,", pounds of would be pennies coat highway aft...",Jonah Engel Bromwich,it looked like free money all over the highway...,0.0,val
20788,maine s gov . lepage threatens to investigate ...,Joe Clark,google pinterest digg linkedin reddit stumbleu...,1.0,val
21208,"j . geils , whose band s catchy pop hits color...",Niraj Chokshi,"j . geils , the guitarist who lent his name to...",,test
350,trump adviser says israeli settlements not ill...,Middle East Eye,trump adviser says israeli settlements not ill...,1.0,train
8669,quiz which matters most to mental health ? rai...,Heather Callaghan,"by heather callaghan , editor there s no denyi...",1.0,train
23825,new tests put perspective to x reporting limit...,Heather Callaghan,by cassius kamarampi era of wisdom new rain te...,,test
16025,alleged mexican cartel operatives charged in ...,Ryan Saavedra,a federal grand jury returned an indictment ch...,0.0,val
18996,trump warns of world war iii if clinton is ele...,,email donald trump warned in an interview tues...,1.0,train
24445,now pak pm nawaz sharif announces demonetizati...,UnReal Mama,now pak pm nawaz sharif announces demonetizati...,,test


In [13]:
# train_news_final.to_csv(args.output_munged_csv, index=False)
# test_news.to_csv(args.output_test_csv, index=False)
news_final.to_csv(args.output_munged_csv)

In [14]:
news_final.dtypes

title     object
author    object
text      object
label     object
split     object
dtype: object