# Descriptive Analysis Script
**Applying techniques to learn about the datasets**

**Importing libraries**

In [11]:
import pandas as pd
import numpy as np
import datetime as dt

**1. Change the working directory to the dataset directory**

**2. When running the test script, comment out the line below**

In [39]:
# cd "D:\University\FIT3162\FAKEDDIT"

In [36]:
def perform_statistics(df, clean=True,
                       output_path=r'D:\University\FIT3162\Project\Fake-News-Detection\Descriptive Analysis\post_stats_output.csv'):
    """
    Print descriptive statistics for a post dataset and save them to a CSV file.

    param df: post dataframe to be analyzed. The columns read are '2_way_label',
        'subreddit', 'domain', 'num_comments', 'title' and 'created_utc'.
        The dataframe itself is left unmodified.
    param clean: True when 'created_utc' holds values that pd.to_datetime can
        parse (the processed cleaned_df.csv dataset); False when it holds
        numeric epoch timestamps (the raw, unprocessed dataset).
    param output_path: file that receives the header line and the single data
        row; this is the output used for the test cases. Any existing file at
        this path is overwritten.
    return: None. The statistics are printed and written to output_path.
    """
    total = len(df)
    # Label 0 is counted as fake and label 1 as true.
    fake_articles = len(df.loc[df['2_way_label'] == 0])
    true_articles = len(df.loc[df['2_way_label'] == 1])
    num_of_subreddits = df['subreddit'].nunique()
    num_of_domains = df['domain'].nunique()
    mean_comments = df['num_comments'].mean()
    # A missing title becomes an empty string, i.e. it contributes zero words.
    mean_title_len = df['title'].fillna('').str.split().apply(len).mean()

    # Work on a local series so the caller's 'created_utc' column keeps its
    # dtype and the function can be re-run on the same dataframe.
    if clean:
        created = pd.to_datetime(df['created_utc'])
        min_date = created.min()
        max_date = created.max()
    else:
        created = df['created_utc']
        # NOTE(review): fromtimestamp() converts to the machine's local time
        # zone although the column name says UTC - confirm this is intended.
        min_date = dt.datetime.fromtimestamp(float(created.min()))
        max_date = dt.datetime.fromtimestamp(float(created.max()))

    separator = "-" * 30
    report = [
        ("Total Samples:", total),
        ("Fake Samples:", fake_articles),
        ("True Samples:", true_articles),
        ("Unique Subreddits:", num_of_subreddits),
        ("Unique Domains:", num_of_domains),
        ("Mean No of Comments:", mean_comments),
        ("Mean Words in title:", mean_title_len),
        ("Min Creation Date:", min_date),
        ("Max Creation Date:", max_date),
    ]
    print(separator)
    for label, value in report:
        print(label, value)
        print(separator)

    columns = ['total', 'fake_articles', 'true_articles', 'num_subreddit', 'num_domain',
               'mean_comment', 'mean_title', 'min_date', 'max_date']

    # output used for test cases
    # The 'with' block closes the handle, so the data row is flushed to disk
    # before anything else reads the file.
    with open(output_path, 'w') as file:
        file.write(','.join(columns) + '\n')
        file.write('{}, {}, {}, {}, {}, {}, {}, {}, {}\n'.format(
            total, fake_articles, true_articles, num_of_subreddits, num_of_domains,
            mean_comments, mean_title_len, min_date, max_date))



In [40]:
def check_comment_stats(df,
                        output_path=r'D:\University\FIT3162\Project\Fake-News-Detection\Descriptive Analysis\post_comments_stats_output.csv'):
    """
    Print descriptive statistics for a comment dataset and save them to a CSV file.

    param df: comment dataframe to be analyzed. Only the 'submission_id'
        column is read; the dataframe is not modified.
    param output_path: file that receives the header line and the single data
        row; this is the output used for the test cases. Any existing file at
        this path is overwritten.
    return: None. The statistics are printed and written to output_path.
    """
    total = len(df)
    unique_posts = df['submission_id'].nunique()

    separator = "-" * 30
    print(separator)
    print("Total Comments:", total)
    print(separator)
    print("Unique Posts with comments:", unique_posts)
    print(separator)

    # output used for test cases
    # The 'with' block closes the handle, so the data row is flushed to disk
    # before anything else reads the file.
    with open(output_path, 'w') as file:
        file.write('total,unique_posts\n')
        file.write('{}, {}\n'.format(total, unique_posts))

**Calling the analysis functions**

In [41]:
if __name__ == '__main__':
    # Driver: runs the post report and then the comment report.
    # Both CSV names are relative, so they are resolved against the current
    # working directory.
    print("Descriptive Analysis")
    # Post-level statistics; perform_statistics is called with its default
    # clean=True.
    post_df = pd.read_csv("cleaned_df.csv")
    perform_statistics(post_df)
    # Comment-level statistics.
    comment_df = pd.read_csv("cleaned_comments.csv")
    check_comment_stats(comment_df)

Descriptive Analysis
------------------------------
Total Comments: 4428085
------------------------------
Unique Posts with comments: 53265
------------------------------
