# Parameters

In [1]:
# Input CSV read by the preprocessing cell, and the folder that receives
# train.csv / valid.csv / test.csv.
# NOTE(review): both are absolute, machine-specific paths - adjust before running elsewhere.
raw_data_path = '/home/Danny/pytorch/dataset/news.csv'
destination_folder = '/home/Danny/pytorch/dataset/'

# Both ratios are passed as `train_size` to train_test_split, per label:
#   train_test_ratio  - fraction of the rows kept for train+valid; the rest becomes the test set.
#   train_valid_ratio - fraction of that train+valid portion used for training; the rest is validation.
# NOTE(review): 0.10 leaves roughly 90% of the data in the test set - confirm this is intended.
train_test_ratio = 0.10
train_valid_ratio = 0.80

# Maximum number of whitespace-separated words kept by trim_string.
first_n_words = 200

# Libraries

In [2]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing

In [3]:
def trim_string(x, n_words=None):
    """Return the first ``n_words`` whitespace-separated words of ``x``.

    Runs of whitespace (spaces, tabs, newlines) are collapsed: the kept words
    are re-joined with single spaces and leading/trailing whitespace is lost.

    Parameters
    ----------
    x : str
        Text to truncate.
    n_words : int, optional
        Maximum number of words to keep. When omitted, the notebook-level
        ``first_n_words`` setting is used, so existing one-argument calls
        (e.g. ``Series.apply(trim_string)``) behave as before.

    Returns
    -------
    str
        At most ``n_words`` words joined by single spaces; ``''`` when ``x``
        contains no words.
    """
    if n_words is None:
        n_words = first_n_words

    # maxsplit stops splitting after n_words separators, so the (possibly very
    # long) remainder stays as one trailing element that the slice discards.
    words = x.split(maxsplit=n_words)

    return ' '.join(words[:n_words])

In [4]:
# Read raw data
df_raw = pd.read_csv(raw_data_path)

# Prepare columns
# Missing titles/bodies are read as NaN. Normalise them to empty strings so the
# concatenation, the length filter and trim_string all operate on real strings
# (a NaN body would otherwise slip past the filter and crash trim_string).
df_raw['title'] = df_raw['title'].fillna('').astype(str)
df_raw['text'] = df_raw['text'].fillna('').astype(str)

# Binary target: 1 for rows labelled 'FAKE', 0 for every other label value.
df_raw['label'] = (df_raw['label'] == 'FAKE').astype('int')
df_raw['titletext'] = df_raw['title'] + ". " + df_raw['text']
df_raw = df_raw.reindex(columns=['label', 'title', 'text', 'titletext'])

# Drop rows with empty text (fewer than 5 characters).
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the unfiltered data.
df_raw = df_raw[df_raw['text'].str.len() >= 5].copy()

# Trim text and titletext to first_n_words
df_raw['text'] = df_raw['text'].apply(trim_string)
df_raw['titletext'] = df_raw['titletext'].apply(trim_string)

# Split according to label so that every split keeps both classes
df_real = df_raw[df_raw['label'] == 0]
df_fake = df_raw[df_raw['label'] == 1]

# Train-test split
# train_size=train_test_ratio: that fraction of each class goes to train+valid,
# the remainder becomes the test set.
# NOTE(review): with train_test_ratio = 0.10 the test set holds ~90% of the
# rows - confirm this is intended rather than test_size.
df_real_full_train, df_real_test = train_test_split(df_real, train_size=train_test_ratio, random_state=1)
df_fake_full_train, df_fake_test = train_test_split(df_fake, train_size=train_test_ratio, random_state=1)

# Train-valid split
df_real_train, df_real_valid = train_test_split(df_real_full_train, train_size=train_valid_ratio, random_state=1)
df_fake_train, df_fake_valid = train_test_split(df_fake_full_train, train_size=train_valid_ratio, random_state=1)

# Concatenate splits of different labels (real rows first, then fake rows)
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Write preprocessed data
# os.path.join avoids the doubled separator that plain concatenation produced
# when destination_folder already ends with '/'.
os.makedirs(destination_folder, exist_ok=True)
df_train.to_csv(os.path.join(destination_folder, 'train.csv'), index=False)
df_valid.to_csv(os.path.join(destination_folder, 'valid.csv'), index=False)
df_test.to_csv(os.path.join(destination_folder, 'test.csv'), index=False)

In [5]:
# Preview the training split (the frame written to train.csv).
df_train

Unnamed: 0,label,title,text,titletext
0,0,Obamacare's unlikely No. 1 city,"Killing Obama administration rules, dismantlin...",Obamacare's unlikely No. 1 city. Killing Obama...
1,0,New York restores order for 2016 front-runners,Hillary Clinton and Donald Trump scored resoun...,New York restores order for 2016 front-runners...
2,0,These political scientists may have just disco...,There's a lot of disgust in America with polit...,These political scientists may have just disco...
3,0,"Hillary supporters: We're excited, too, but al...",Trump and Sanders get all the attention for th...,"Hillary supporters: We're excited, too, but al..."
4,0,Hung jury in Jodi Arias sentencing phase remov...,The jury in the Jodi Arias case tasked with de...,Hung jury in Jodi Arias sentencing phase remov...
...,...,...,...,...
497,1,John Pilger: ‘The truth is… there was no one t...,Leave a Reply Click here to get more info on f...,John Pilger: ‘The truth is… there was no one t...
498,1,DEVELOPING: FBI Reopens Investigation into Cli...,The Clinton email scandal has taken an unexpec...,DEVELOPING: FBI Reopens Investigation into Cli...
499,1,Will Hillary Accept Defeat?,Print The headlines are in. Trump is the “anti...,Will Hillary Accept Defeat?. Print The headlin...
500,1,How to Solve the Illegal Immigration Problem,Republican presidential candidate Donald Trump...,How to Solve the Illegal Immigration Problem. ...


In [6]:
# Preview the validation split (the frame written to valid.csv).
df_valid

Unnamed: 0,label,title,text,titletext
0,0,OnPolitics | 's politics blog,Who has Trump appointed to his cabinet so far?...,OnPolitics | 's politics blog. Who has Trump a...
1,0,Who Lost Iraq?,"For a brief, happy—and misguided—moment, most ...","Who Lost Iraq?. For a brief, happy—and misguid..."
2,0,Robert Durst of HBO's 'The Jinx' charged with ...,(CNN) Millionaire real estate heir Robert Durs...,Robert Durst of HBO's 'The Jinx' charged with ...
3,0,The Iran Deal and the Cost of Political Polari...,"In just a week or two, Congress will consider ...",The Iran Deal and the Cost of Political Polari...
4,0,Is Clinton ready for the Wild West campaign of...,Presidential campaigns are always studies in c...,Is Clinton ready for the Wild West campaign of...
...,...,...,...,...
122,1,Comment on Election 2016: Playing a Game of Ch...,"by Yves Smith By Michael T. Klare, a professor...",Comment on Election 2016: Playing a Game of Ch...
123,1,Prowling Amur tiger nabbed near Vladivostok,Prowling Amur tiger nabbed near Vladivostok Oc...,Prowling Amur tiger nabbed near Vladivostok. P...
124,1,Anonymous: Hillary Could Be In Handcuffs In 72...,"Obama and Michelle are out, she could be haule...",Anonymous: Hillary Could Be In Handcuffs In 72...
125,1,Hate Rising with Jorge Ramos,"Hate Rising with Jorge Ramos Fusion, October 2...",Hate Rising with Jorge Ramos. Hate Rising with...


In [7]:
# Preview the test split (the frame written to test.csv).
df_test

Unnamed: 0,label,title,text,titletext
0,0,The 8th Democratic Debate In 100 Words (And 4 ...,The 8th Democratic Debate In 100 Words (And 4 ...,The 8th Democratic Debate In 100 Words (And 4 ...
1,0,Supreme Court throws out conviction for violen...,The Supreme Court on Monday made it harder for...,Supreme Court throws out conviction for violen...
2,0,John Kerry: ISIS responsible for genocide,(CNN) Secretary of State John Kerry said Thurs...,John Kerry: ISIS responsible for genocide. (CN...
3,0,The GOP case against Loretta Lynch falls apart,Loretta Lynch had them at Jim Crow. Senate Rep...,The GOP case against Loretta Lynch falls apart...
4,0,Is Campus Rape Really An Epidemic?,A documentary about campus rape contains the d...,Is Campus Rape Really An Epidemic?. A document...
...,...,...,...,...
5665,1,AMERICAN EVIL,Share This: By Joe Giambrone What if everythin...,AMERICAN EVIL. Share This: By Joe Giambrone Wh...
5666,1,BREAKING: WikiLeaks Just Released Full ISIS Do...,"in: Government , Government Corruption , Obama...",BREAKING: WikiLeaks Just Released Full ISIS Do...
5667,1,"World Proud Of Its Calm, Measured Response To ...",0 Add Comment IN THE immediate aftermath of Do...,"World Proud Of Its Calm, Measured Response To ..."
5668,1,Blind Mystic Who Predicted 9/11 Has Bad News A...,Remember the blind Bulgarian mystic who predic...,Blind Mystic Who Predicted 9/11 Has Bad News A...
