# Data Preprocessing Script
**Applying pre-processing techniques to the dataset**

**Importing libraries**

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import glob, os

**1. Change the working directory to the dataset directory**

**2. When running the test script, comment out the line below**

In [2]:
#cd "D:\University\FIT3162\Project\Fake-News-Detection\Data Preprocessing"

D:\University\FIT3162\Project\Fake-News-Detection\Data Preprocessing


In [3]:
def remove_unwanted_columns(df, list_remove):
    """
    Drop the columns that are not required in the dataset.
    param df: dataframe from which the unwanted columns are to be deleted
    param list_remove: list of column labels to drop; a label that does not
        exist in df makes pandas raise a KeyError
    return: a new dataframe without the listed columns (df itself is not modified)
    """
    return df.drop(columns=list_remove)

In [4]:
def delete_duplicated_posts(df):
    """
    Remove every row whose id occurs more than once in the dataframe.
    An id is expected to be unique, so all rows sharing a repeated id are
    discarded; none of the copies is kept (keep=False).
    :param df: dataframe with an 'id' column identifying each post or comment
    :return: a new dataframe holding only the rows whose id appears exactly once
    """
    id_columns = ['id']
    return df.drop_duplicates(subset=id_columns, keep=False)

In [44]:
def delete_empty_posts(df, subset_list):
    """
    Drop the rows that have a missing (NaN) value in any of the given columns.
    :param df: dataframe from which the empty rows are removed
    :param subset_list: list of column names that must not be empty
    :return: a new dataframe keeping only the rows that have a value in every
        listed column
    """
    return df.dropna(axis=0, how='any', subset=subset_list)

In [45]:
def delete_removed_comments(df):
    """
    Drop the comments whose text was deleted or removed.
    Such comments carry the exact placeholder "[deleted]" or "[removed]" in the
    'body' column. Rows whose body is NaN are NOT dropped by this function.
    :param df: dataframe of comments with a 'body' column
    :return: a new dataframe without the placeholder comments
    """
    placeholders = ["[deleted]", "[removed]"]
    is_placeholder = df['body'].isin(placeholders)
    return df.loc[~is_placeholder]

In [46]:
def remove_subreddits(df, sub_to_remove=None):
    """
    Drop the rows that belong to unwanted subreddits.
    Only text is being processed, so by default the rows of the subreddits
    where images occur are removed.
    param df: dataframe with a 'subreddit' column
    param sub_to_remove: optional list of subreddit names to exclude; when it
        is omitted (None) the built-in list of image subreddits below is used
    return: a new dataframe without the rows of the excluded subreddits
    """
    if sub_to_remove is None:
        # Default: the image subreddits this script has always excluded.
        sub_to_remove = ["photoshopbattles", "pic", "mildlyinteresting", "fakealbumcovers",
                         "propagandaposters", "misleadingthumbnails", "pareidolia",
                         "psbattle_artwork", "fakehistoryporn"]
    # Names are compared exactly, so "pics" is not matched by "pic".
    excluded = df['subreddit'].isin(sub_to_remove)
    return df[~excluded]

In [47]:
# Convert the created_utc column from Unix timestamps to datetime objects
def get_date(created):
    """
    Convert a Unix timestamp into a datetime expressed in UTC.

    The timestamp is interpreted in UTC rather than in the local timezone of
    the machine, so the result is the same wherever the script is run.
    param created: seconds since the Unix epoch (int or float)
    return: naive datetime.datetime holding the corresponding UTC date and time
    """
    utc_moment = dt.datetime.fromtimestamp(created, tz=dt.timezone.utc)
    # Strip the tzinfo so callers keep receiving a naive datetime, as before.
    return utc_moment.replace(tzinfo=None)

def change_date(df):
    """
    Convert the created_utc column from timestamps to datetimes.

    get_date is applied to every value of the created_utc column. The
    dataframe passed in is left unchanged; the converted column is placed in
    a new dataframe.
    param df: dataframe whose created_utc column holds timestamps
    return: a new dataframe, identical to df except that created_utc holds
        the values returned by get_date
    """
    converted = df["created_utc"].apply(get_date)
    # assign() builds a new frame instead of writing into df, so a frame that
    # was produced by filtering another one can be passed in without
    # modifying it and without triggering pandas' SettingWithCopyWarning.
    return df.assign(created_utc=converted)

In [48]:
def filter_posts(post_df, min_comments):
    """
    Keep only the posts that have at least min_comments comments.
    param post_df: dataframe of posts with a 'num_comments' column
    param min_comments: smallest number of comments a post must have to be kept
    return: a new dataframe with the rows where num_comments >= min_comments
        (rows with a missing num_comments are dropped as well)
    """
    enough_comments = post_df['num_comments'] >= min_comments
    return post_df[enough_comments]

In [49]:
def filter_comments(df, ids):
    """
    Keep only the comments that belong to one of the given submissions.
    param df: dataframe of comments with a 'submission_id' column
    param ids: list of submission ids whose comments are to be kept
    return: a new dataframe with the comments whose submission_id is in ids
    """
    wanted = df['submission_id'].isin(ids)
    return df.loc[wanted]

In [1]:
def read_post_dataset():
    """
    Load the train, test and validate TSV files and stack them into one dataframe.
    The rows appear in the order train, test_public, validate and keep the row
    index they had inside their own file (the index is not reset here).
    return: dataframe holding the posts of all three files
    """
    split_paths = (
        "datasetv2.0/train.tsv",
        "datasetv2.0/test_public.tsv",
        "datasetv2.0/validate.tsv",
    )
    splits = [pd.read_csv(path, sep='\t') for path in split_paths]
    return pd.concat(splits, axis=0)

In [51]:
def read_comment_dataset():
    """
    Read every comment CSV file of the 'comments' directory into one dataframe.

    The files matching comments/comment*.csv are loaded in sorted path order,
    so the row order of the result does not depend on the order in which the
    file system happens to list the files.
    return: dataframe with the rows of all comment files and a fresh
        0..n-1 index
    raises ValueError: raised by pandas.concat when no comment file is found
    """
    # glob.glob gives no ordering guarantee; sort for a reproducible result.
    comment_paths = sorted(glob.glob(os.path.join('comments', "comment*.csv")))
    frames = [pd.read_csv(path) for path in comment_paths]
    # ignore_index=True renumbers the rows, replacing the per-file indexes.
    return pd.concat(frames, ignore_index=True)

**The main functions that pre-process the posts and comments by applying all of the functions above**

In [52]:
def preprocess_posts(min_comments=20):
    """
    Apply all post functions to the post dataset for preprocessing.

    Steps, in order: read the dataset, drop the unwanted columns, drop the
    posts without a title, drop the posts with a repeated id, drop the posts
    of the excluded subreddits, keep the posts with at least min_comments
    comments, convert created_utc to datetimes and renumber the rows.
    param min_comments: smallest number of comments a post must have to be
        kept (default 20)
    return: the pre-processed post dataset
    """
    post_df = read_post_dataset()
    # Each column label is listed once; every one of them must exist in the
    # dataset, otherwise the drop raises a KeyError.
    remove_col = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'hasImage',
                  'image_url', 'linked_submission_id', '3_way_label', '6_way_label', 'author']
    post_df = remove_unwanted_columns(post_df, remove_col)
    post_df = delete_empty_posts(post_df, subset_list=['title'])
    post_df = delete_duplicated_posts(post_df)
    post_df = remove_subreddits(post_df)
    post_df = filter_posts(post_df, min_comments)
    post_df = change_date(post_df)
    # The filters above leave gaps in the index; renumber the rows from 0.
    post_df = post_df.reset_index(drop=True)
    return post_df

In [53]:
def preprocess_comments(ids):
    """
    Apply all the comment functions to the comment dataset for preprocessing.

    Steps, in order: read the comment files, drop the deleted/removed
    comments, drop the comments without a submission_id or body, drop the
    'Column1' column, keep the comments of the given submissions and renumber
    the rows.
    param ids: ids of the submissions whose comments are to be kept
    return: the pre-processed comments dataset
    """
    cleaned = (
        read_comment_dataset()
        .pipe(delete_removed_comments)
        .pipe(delete_empty_posts, ['submission_id', 'body'])
        .pipe(remove_unwanted_columns, ['Column1'])
        .pipe(filter_comments, ids)
    )
    # The filters above leave gaps in the index; renumber the rows from 0.
    return cleaned.reset_index(drop=True)

**The main functions are called here and the cleaned datasets are exported in CSV format to the working directory**

In [54]:
if __name__ == '__main__':
    print("Data Preprocessing")

    # Clean the posts first: the comments are then restricted to the posts
    # that survived the cleaning.
    post_df = preprocess_posts()
    ids = post_df["id"].unique()
    comment_df = preprocess_comments(ids)

    # Export both datasets; 'utf-8-sig' writes a byte order mark at the start
    # of each file, and the row index is written as the first column.
    post_df.to_csv("cleaned_df.csv", encoding='utf-8-sig')
    comment_df.to_csv("cleaned_comments.csv", encoding='utf-8-sig')

Data Preprocessing


  
