## Imports

In [1]:
import pandas as pd
import numpy as np

## Dataset Import

In [5]:
# Load the training CSV and keep only the text column and its binary label.
df = pd.read_csv("politifact_clean_binarized_train.csv")[['statement', 'veracity']]
print(df.head())
df.shape

                                           statement  veracity
0  Sen. Kamala Harris is "supporting the animals ...         0
1  Says Ronald Reagan said immigrants "brought wi...         1
2  Says Democratic Senators "demand Supreme Court...         0
3  "Tim Kaine doesn’t want a border at all. He wa...         0
4                 "George H.W. Bush has died at 94."         0


(10070, 2)

In [6]:
# First five rows of the training frame, displayed as a quick visual sample.
exp = df.head(5)
exp

Unnamed: 0,statement,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",0
1,"Says Ronald Reagan said immigrants ""brought wi...",1
2,"Says Democratic Senators ""demand Supreme Court...",0
3,"""Tim Kaine doesn’t want a border at all. He wa...",0
4,"""George H.W. Bush has died at 94.""",0


## Data Manipulation Functions

In [7]:
def insert(df, row):
    """Append ``row`` to ``df`` in place, one label past the current maximum.

    Args:
        df: DataFrame to extend; it is modified in place and nothing is returned.
        row: Values for the new row, one per column of ``df``.

    An empty frame has no maximum index label (``df.index.max()`` is NaN), so
    the first row is stored under label 0.
    """
    last_label = df.index.max()
    next_label = 0 if pd.isna(last_label) else last_label + 1
    df.loc[next_label] = row

In [8]:
def listToString(s):
    """Join the strings in ``s`` into one string separated by single spaces."""
    separator = ' '
    joined = separator.join(s)
    return joined

## Perform Augmentation Operation

In [26]:
# Create a new dataframe
df_new = pd.DataFrame(columns=["statement", "veracity"])
print(df_new.head())

Empty DataFrame
Columns: [statement, veracity]
Index: []


In [None]:
# The cell below takes around 30 minutes to run 

In [27]:
# Prefix augmentation: for every statement emit its 2-word, 4-word, ...
# prefixes (each strictly shorter than the statement), followed by the full
# statement re-joined with single spaces. Every emitted row carries the
# label of the statement it came from.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Adding rows to a DataFrame one at a time via `.loc` enlargement copies
# data on each insert, which makes the loop far slower than it needs to be.
# Rebuilding `df_new` here also makes the cell safe to re-run: it no longer
# appends duplicates to whatever `df_new` already contained.
augmented_rows = []

# zip() walks the two columns positionally, so this does not rely on `df`
# having a 0..n-1 index the way `df['statement'][i]` does.
for statement, veracity in zip(df['statement'], df['veracity']):
    words = statement.split()

    # Even-length prefixes only; range() stops before len(words), so the
    # full statement is never produced here and is added exactly once below.
    for end in range(2, len(words), 2):
        augmented_rows.append((listToString(words[:end]), veracity))
    augmented_rows.append((listToString(words), veracity))

df_new = pd.DataFrame(augmented_rows, columns=["statement", "veracity"])

## Check Results

In [28]:
# Report the size of the augmented frame as (rows, columns).
print(df_new.shape)

(102893, 2)


In [29]:
# Print the first 100 augmented rows to eyeball the growing prefixes.
print(df_new[:100])

                                            statement veracity
0                                         Sen. Kamala        0
1                               Sen. Kamala Harris is        0
2               Sen. Kamala Harris is "supporting the        0
3    Sen. Kamala Harris is "supporting the animals of        0
4   Sen. Kamala Harris is "supporting the animals ...        0
..                                                ...      ...
95  Says a progressive income tax proposal from De...        0
96  Says a progressive income tax proposal from De...        0
97  Says a progressive income tax proposal from De...        0
98  Says a progressive income tax proposal from De...        0
99                                     Says Wisconsin        0

[100 rows x 2 columns]


## Save to CSV

In [30]:
# Show the column names of the source frame.
# NOTE(review): this inspects `df`, while the frame saved in this section is
# `df_new` — confirm which one was meant to be checked.
df.columns

Index(['statement', 'veracity'], dtype='object')

In [31]:
# Write the augmented frame to disk; index=False leaves the row labels out of the file.
df_new.to_csv('politifact_binarized_augmented.csv', index=False)

## Check saved CSV

In [9]:
# Read the augmented CSV back from disk and show its first rows.
df2 = pd.read_csv('politifact_binarized_augmented.csv')
df2.head()

Unnamed: 0,statement,veracity
0,Sen. Kamala,0
1,Sen. Kamala Harris is,0
2,"Sen. Kamala Harris is ""supporting the",0
3,"Sen. Kamala Harris is ""supporting the animals of",0
4,"Sen. Kamala Harris is ""supporting the animals ...",0


In [10]:
# Column names of the reloaded frame.
df2.columns

Index(['statement', 'veracity'], dtype='object')

In [11]:
# Size of the reloaded frame as (rows, columns).
df2.shape

(102893, 2)

In [12]:
# Keep the first 90,000 rows (by position) of the reloaded frame as the training subset.
# NOTE(review): the cut is purely positional, so it may fall between prefixes
# of a single statement, and the rows past 90,000 are not written anywhere in
# the visible cells — confirm this is intended.
train_augmented = df2.iloc[:90000]
train_augmented.shape

(90000, 2)

In [13]:
# Write the truncated training subset to its own CSV, without the index column.
train_augmented.to_csv("politifact_train_augmented.csv", index=False)

## Check test data

In [14]:
# Load the test CSV and list its columns.
test = pd.read_csv('politifact_clean_binarized_test.csv')
test.columns

Index(['statement', 'source', 'link', 'veracity'], dtype='object')

In [15]:
# Size of the test frame as (rows, columns).
test.shape

(1118, 4)