# Data Analysis
This file contains the initial analysis of the data and plots the results.

In [12]:
import numpy as np
import pandas as pd
import nltk
from nltk import FreqDist
from config import raw_data_dir_path
from nltk.corpus import stopwords
# NOTE(review): the set built here is not bound to any name, so it is
# discarded; the statement only evaluates the English stopword list once.
set(stopwords.words('english'))

# Token counted in the comments to tally YouTube links (presumably inserted
# in place of the original URLs by an upstream cleaning step - TODO confirm).
YOUTUBE_KEYWORD = "youtubelink"

### calculate_most_common_words
Find the most frequent words for each subreddit. Stopwords and punctuation are removed from the comments before the words are counted.

In [13]:
def calculate_most_common_words(data, top_nb=5):
    """Print and plot the most frequent words of each subreddit.

    For every distinct value of ``data["subreddits"]`` the matching comments
    are joined, tokenized, and stripped of English stopwords and
    non-alphabetic tokens. The frequency distribution of the remaining words
    is plotted and printed, then merged into a distribution covering all
    subreddits. Finally ``idtf`` is called once per subreddit with the
    combined distribution and that subreddit's distribution.

    Args:
        data: DataFrame with a "subreddits" column and a "comments" column.
        top_nb: Number of most common words to plot and to print.
    """
    # FreqDist.plot draws with matplotlib; import it locally so that the
    # plt.show() call below does not rely on a name defined elsewhere.
    import matplotlib.pyplot as plt

    print("Computing most common words...")

    # Build the stopword set once: stopwords.words() returns a list, so
    # evaluating it for every token meant a fresh list and a linear scan.
    english_stopwords = set(stopwords.words('english'))

    fdist_allsubs = FreqDist()
    list_subreddit_freq = []
    subreddit_names = data.subreddits.unique()

    for subreddit in subreddit_names:
        print("\t" + subreddit)

        # get data associated to subreddit
        subreddit_data = data.loc[data["subreddits"] == subreddit]

        # tokenize all comments of the subreddit as one text
        tokens = nltk.tokenize.word_tokenize(
            subreddit_data["comments"].str.cat(sep="\n")
        )
        # remove stopwords and punctuation (non-alphabetic tokens)
        words = [
            word for word in tokens
            if word not in english_stopwords and word.isalpha()
        ]

        # get frequency distribution of all words
        fdist_subreddit = FreqDist(words)

        # plot as many words as are printed below (was hard-coded to 5)
        fdist_subreddit.plot(top_nb, cumulative=False)
        plt.show()

        print(fdist_subreddit)
        print(fdist_subreddit.most_common(top_nb))
        list_subreddit_freq.append(fdist_subreddit)
        fdist_allsubs += fdist_subreddit

    print("\tall subreddits")
    print(fdist_allsubs)
    print(fdist_allsubs.most_common(top_nb))

    # list_subreddit_freq was filled in the order of subreddit_names,
    # so the two sequences can be walked in lockstep.
    for subreddit, subreddit_freq in zip(subreddit_names, list_subreddit_freq):
        idtf(fdist_allsubs, subreddit_freq, subreddit)

### IDTF
Using the frequency distributions we already have, compute for each word the ratio: occurrences of the word in one subreddit / occurrences of the word in all subreddits.

In [14]:
def idtf(fdist_all, fdist_subreddit, subreddit_name, top_nb=10, min_ratio=0.5):
    """Print the words that occur mostly in one subreddit.

    For every word of ``fdist_all`` the ratio
    ``fdist_subreddit[word] / fdist_all[word]`` is computed. Words whose
    ratio is strictly greater than ``min_ratio`` are kept, sorted by
    ascending ratio, and the last ``top_nb`` of them are printed.

    Args:
        fdist_all: Mapping word -> count over all subreddits.
        fdist_subreddit: Mapping word -> count for one subreddit; it is
            indexed with every word of ``fdist_all``.
        subreddit_name: Name used in the printed header (a string).
        top_nb: Maximum number of words to print.
        min_ratio: Exclusive lower bound on the ratio for a word to be kept.
    """
    ratios = {}
    for word, total_count in fdist_all.items():
        # A zero total would raise ZeroDivisionError and carries no
        # information about where the word occurs, so skip it.
        if not total_count:
            continue
        ratio = fdist_subreddit[word] / total_count
        if ratio > min_ratio:
            ratios[word] = ratio

    ranked = sorted(ratios.items(), key=lambda kv: kv[1])
    print("The top " + str(top_nb) + " words for " + subreddit_name + ": \n\t")
    # Slice from an explicit start index: ranked[-0:] would return the whole
    # list instead of an empty one when top_nb is 0.
    print(ranked[max(len(ranked) - top_nb, 0):])

### calculate_avg_nb_words
Compute the average number of words per comment, and print the number of YouTube links for each subreddit and for all subreddits combined.

In [15]:
def calculate_avg_nb_words(data):
    """Print the mean number of words per comment and the YouTube link counts.

    The statistics are printed for each distinct value of
    ``data["subreddits"]`` and then for the whole data set.

    Side effect: a "count words" column holding the number of
    whitespace-separated words of each comment is added to ``data``.

    Args:
        data: DataFrame with a "subreddits" column and a string "comments"
            column.
    """
    COUNT_COLUMN = "count words"

    print("Computing average number of words per comment...")
    # Count whitespace-separated tokens. The previous len() on the raw
    # string counted characters, not words, and failed on missing comments;
    # here a missing comment yields NaN, which mean() ignores.
    data[COUNT_COLUMN] = data["comments"].str.split().str.len()
    count_youtube_comment = 0

    for subreddit in data.subreddits.unique():
        print("\t" + subreddit)
        # get data associated to subreddit
        subreddit_data = data.loc[data["subreddits"] == subreddit]

        # get mean of count of words
        subreddit_mean_words = subreddit_data[COUNT_COLUMN].mean()
        print(subreddit_mean_words)

        # count the occurrences of the YouTube placeholder in the comments
        subreddit_count_youtube_comment = (
            subreddit_data["comments"].str.count(YOUTUBE_KEYWORD).sum()
        )
        print("nb of youtube comment:" + str(subreddit_count_youtube_comment))
        # add to global youtube count
        count_youtube_comment += subreddit_count_youtube_comment

    print("\t all subreddits")
    # get mean of count of words
    mean_words = data[COUNT_COLUMN].mean()
    print(mean_words)
    print("total nb of youtube comment:" + str(count_youtube_comment))

# MAIN

In [None]:
# Script entry point: load the CSV from the raw-data directory and run both
# analyses on it. `file_path` and `data` stay module-level names.
file_path = raw_data_dir_path + "/LEMMA_train_raw_clean.csv"
print("Reading data file...")
# The functions below read the "subreddits" and "comments" columns.
data = pd.read_csv(file_path)

print("Computing statistics...") 
# Prints and plots word frequencies per subreddit and overall.
calculate_most_common_words(data)
# Adds a "count words" column to `data` and prints per-comment averages.
calculate_avg_nb_words(data)


#     TODO: scoring function