# Data Analysis
This file contains the initial analysis of the data and plots the results.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk import FreqDist
from config import processed_dir_path
from data_processing.vocabulary import token_youtube_link, token_internet_link, token_emoticon_funny, compute_sentiment_of_comments
from nltk.corpus import stopwords
# NOTE(review): the set built here is not assigned to any name, so its value is
# discarded; the only visible effect is that the English stopword corpus gets
# loaded once. Presumably a leftover from interactive exploration - confirm.
set(stopwords.words('english'))
# Fetch the VADER lexicon through nltk's downloader.
# Presumably required by compute_sentiment_of_comments - verify against that helper.
nltk.download('vader_lexicon')

### calculate_most_common_words
Find the most frequent words for each subreddit. The comments are cleaned first (stopwords and punctuation are removed).

In [None]:
def calculate_most_common_words(data, top_nb = 5):
    """Print and plot the most frequent words of every subreddit.

    For each distinct value of ``data.subreddits`` the matching comments are
    concatenated, tokenized with nltk, and filtered so that only alphabetic
    tokens that are not English stopwords remain. The resulting frequency
    distribution is plotted and printed, and the ``top_nb`` most common words
    are printed. The same summary is then printed for all subreddits combined,
    and finally ``idtf`` is called once per subreddit.

    Args:
        data: DataFrame with a "subreddits" column and a "comments" column
            of strings.
        top_nb: number of words to show in the plot and in the printed
            "most common" lists.

    Returns:
        None. All results are printed or plotted.
    """
    print("Computing most common words...")

    # Build the stopword set once. Evaluating stopwords.words('english') inside
    # the filter below would rebuild the list for every single token.
    # Matching is case-sensitive.
    english_stopwords = set(stopwords.words('english'))

    fdist_allsubs = FreqDist()
    list_subreddit_freq = []
    subreddit_names = data.subreddits.unique()

    for subreddit in subreddit_names:
        print("\t" + subreddit)

        # get data associated to subreddit
        subreddit_data = data.loc[data["subreddits"] == subreddit]

        # tokenize all comments of the subreddit at once
        tokens = nltk.tokenize.word_tokenize(subreddit_data["comments"].str.cat(sep="\n"))
        # remove stopwords and anything that is not purely alphabetic
        subreddit_words = [
            word for word in tokens
            if word.isalpha() and word not in english_stopwords
        ]

        # get frequency distribution of all words
        fdist_subreddit = FreqDist(subreddit_words)

        # plot as many words as are listed below (was hard-coded to 5)
        fdist_subreddit.plot(top_nb, cumulative = False)
        plt.show()

        print(fdist_subreddit)
        print(fdist_subreddit.most_common(top_nb))
        list_subreddit_freq.append(fdist_subreddit)
        fdist_allsubs += fdist_subreddit

    print("\tall subreddits")
    print(fdist_allsubs)
    print(fdist_allsubs.most_common(top_nb))

    # list_subreddit_freq was filled in the order of subreddit_names,
    # so the two sequences can be walked in lockstep.
    for subreddit_name, subreddit_freq in zip(subreddit_names, list_subreddit_freq):
        idtf(fdist_allsubs, subreddit_freq, subreddit_name)

### IDTF
Using the frequency distributions we already have, compute for each word the ratio (occurrences of the word in one subreddit) / (occurrences of the word in all subreddits).

In [None]:
def idtf(fdist_all, fdist_subreddit, subreddit_name):
    """Print the words that occur mostly in one subreddit.

    For every word of ``fdist_all`` the share of its occurrences that fall
    into ``fdist_subreddit`` is computed. Words whose share is strictly above
    0.5 are kept, ordered by increasing share, and the last ten entries of
    that ordering are printed as ``(word, share)`` pairs.

    Args:
        fdist_all: word -> count mapping over all subreddits.
        fdist_subreddit: word -> count mapping for one subreddit; it is
            indexed with every word of ``fdist_all``.
        subreddit_name: name used in the printed header.

    Returns:
        None. The result is printed.
    """
    shares = {}
    for token, total_count in fdist_all.items():
        share = fdist_subreddit[token] / total_count
        if share > 0.5:
            shares[token] = share

    # ascending order, so the highest shares end up at the tail of the list
    ranked = sorted(shares.items(), key=lambda pair: pair[1])
    print("The top 10 words for " + subreddit_name + ": ")
    print(ranked[-10:])
    print("\n")

### calculate_avg_nb_words
Compute the average number of words per comment, the average word length and the average sentiment, and print the number of YouTube links, internet links and emoticons for each subreddit and in total.

In [None]:
def calculate_avg_nb_words(data):
    """Print per-subreddit and global comment statistics.

    Three columns are added to ``data`` in place:

    * "count words": number of whitespace-separated words of each comment,
    * "count word length": number of characters of the comment once spaces
      are removed, divided by its word count,
    * "sentiment": the value returned by ``compute_sentiment_of_comments``.

    For every subreddit, and then for the whole data set, the means of these
    columns are printed together with the number of matches of the youtube
    link, internet link and emoticon tokens in the comments.

    Comments without any word get NaN as average word length instead of
    triggering a division by zero; pandas skips NaN when taking the mean.

    Args:
        data: DataFrame with a "subreddits" column and a "comments" column
            of strings. It is modified in place (three columns are added).

    Returns:
        None. All results are printed.
    """
    COUNT_COLUMN = "count words"
    COUNT_WORD_LENGTH_COLUMNS = "count word length"
    SENTIMENT = "sentiment"

    # (label used in the printed messages, token counted in the comments)
    token_kinds = (
        ("youtube links", token_youtube_link),
        ("internet links", token_internet_link),
        ("emoticons", token_emoticon_funny),
    )

    print("Computing average number of words per comment...")
    comments = data["comments"]
    word_counts = comments.str.split().str.len()
    # only plain spaces are stripped here, matching the character count
    # convention used so far (tabs/newlines still count as characters)
    char_counts = comments.str.replace(' ', '', regex=False).str.len()
    data[COUNT_COLUMN] = word_counts
    # mask zero word counts so empty comments yield NaN rather than inf / an error
    data[COUNT_WORD_LENGTH_COLUMNS] = char_counts / word_counts.where(word_counts > 0)
    data[SENTIMENT] = compute_sentiment_of_comments(data["comments"])

    # running totals over all subreddits, one per token kind
    totals = {label: 0 for label, _ in token_kinds}

    for subreddit in data.subreddits.unique():
        print("\n\n\t" + subreddit)
        # get data associated to subreddit
        subreddit_data = data.loc[data["subreddits"] == subreddit]

        # mean sentiment of the subreddit's comments
        subreddit_sentiment = subreddit_data[SENTIMENT].mean()
        print("Average sentiment:"+ str(subreddit_sentiment))

        # mean number of words per comment
        subreddit_mean_words = subreddit_data[COUNT_COLUMN].mean()
        print("Average words/comment:"+ str(subreddit_mean_words))

        # mean of the per-comment average word length
        subreddit_mean_word_length = subreddit_data[COUNT_WORD_LENGTH_COLUMNS].mean()
        print("Average length of words:"+ str(subreddit_mean_word_length))

        for label, token in token_kinds:
            # NOTE(review): Series.str.count interprets the token as a regular
            # expression; if a token contains regex metacharacters it should
            # be escaped - confirm against data_processing.vocabulary.
            subreddit_count = subreddit_data["comments"].str.count(token).sum()
            print("nb of " + label + " in " + subreddit + ":" + str(subreddit_count))
            # add to the global count of this token kind
            totals[label] += subreddit_count

    print("\n\n\t all subreddits")
    print("Average sentiment:" + str(data[SENTIMENT].mean()))
    print("Average words/comment:" + str(data[COUNT_COLUMN].mean()))
    print("Average length of word:" + str(data[COUNT_WORD_LENGTH_COLUMNS].mean()))
    for label, _ in token_kinds:
        print("Total nb of " + label + ":" + str(totals[label]))

# MAIN

In [None]:
# Entry point of the notebook: load the CSV found under processed_dir_path
# and run both analyses on it.
file_path = processed_dir_path + "/LEMMA_train_clean.csv"
print("Reading data file...")
data = pd.read_csv(file_path)

print("Computing statistics...") 
# Prints and plots word frequencies per subreddit, then the idtf summary.
calculate_most_common_words(data)
# Adds the word-count, word-length and sentiment columns to `data` in place
# and prints the aggregated statistics.
calculate_avg_nb_words(data)

