# Reddit Post Classifier: EDA

In [1]:
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# shared lemmatizer instance; lemmadata() in the data-prep cell calls it for every token
wordnet_lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import CountVectorizer
# set of entries from NLTK's `words` corpus; lemmadata() keeps only tokens found in it
# NOTE(review): the NLTK corpora used in this notebook presumably need a one-time
# nltk.download(...) on a fresh environment — confirm
english = set(nltk.corpus.words.words())

import plotly.express as px

## 1. Getting Data

We started with two tables of data from Kaggle: https://www.kaggle.com/mswarbrickjones/reddit-selfposts

1. [table of 3,394 subreddits, categorized](data/subreddit_info.csv)
2. [table of 1,033,000 text posts from 1,033 subreddits (1,000 posts per subreddit)](https://kaggle.com/mswarbrickjones/reddit-selfposts#rspct.tsv)

All data is from 2016/06/01 to 2018/06/01.

### a. Getting the relevant subreddits

In [None]:
# read the subreddit metadata table, then preview its first five rows
subreddits = pd.read_csv(
    'data/subreddit_info.csv',
)
subreddits.head(n=5)

In [None]:
# Steps 1 and 2 in one chain: keep the rows whose 'in_data' flag is True, then
# discard the columns that are not used afterwards.
subreddits = (
    subreddits
    .loc[subreddits['in_data'].eq(True)]
    .drop(columns=['reason_for_exclusion', 'category_2', 'category_3', 'in_data'])
)

# Step 3: one frame per top-level category of interest (arts, programming).
arts, programming = (
    subreddits.loc[subreddits['category_1'].eq(category)]
    for category in ('arts', 'programming')
)

### b. Getting the posts from relevant subreddits

In [None]:
# this tsv is available on kaggle, linked above

# # 1. reading in the tsv
# posts = pd.read_csv('rspct.tsv', sep='\t')

# # 2. getting lists of the relevant subreddits
# arts_subreddit_list = list(arts['subreddit'])
# programming_subreddit_list = list(programming['subreddit'])

# # 3. getting the posts into a dataframe
# arts_list = [posts[posts['subreddit'] == subreddit] for subreddit in arts_subreddit_list]
# arts_posts = pd.concat(arts_list)

# programming_list = [posts[posts['subreddit'] == subreddit] for subreddit in programming_subreddit_list]
# programming_posts = pd.concat(programming_list)

# # 4. labelling data and dropping columns
# arts_posts['label'] = 0
# programming_posts['label'] = 1
# raw = pd.concat([arts_posts, programming_posts]).reset_index().drop(['index', 'id'], axis=1)

## 2. Cleaning/Prepping Data

In [None]:
# # 1. combining posts' titles with their body of text
# raw['text'] = raw['title'] + " " + raw['selftext']
# posts = raw.drop(['title', 'selftext', 'subreddit'], axis=1)

# # 2. dumping the cleaned data into its own csv
# # posts.to_csv('arts-programming-reddit-posts.csv')


In [None]:
# 3. prepping data for NLP (tokenize, filter non-english, lemmatize, CountVectorize)
# load the cleaned posts, then discard the 'Unnamed: 0' column that comes with the CSV
posts = pd.read_csv('data/arts-programming-reddit-posts.csv', index_col=None)
posts = posts.drop(columns='Unnamed: 0')

# Built once instead of on every call: lemmadata() runs once per post and the
# stop-word list does not change between calls.
_STOP_WORDS = frozenset(stopwords.words('english'))


def lemmadata(doc):
    """Normalize one post into a space-separated string of lemmatized English words.

    Steps, in order: extract alphabetic tokens (optionally followed by an
    apostrophe suffix, e.g. "don't"), lowercase them, drop NLTK English stop
    words, lemmatize each remaining token as a verb, drop the token 'lb', and
    keep only tokens present in the notebook-level `english` word set.

    Parameters
    ----------
    doc : str
        Raw post text. Any non-string value (for example NaN coming from a
        missing field in a pandas column) is treated as an empty document
        instead of raising inside the tokenizer.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; "" when none survive.
    """
    if not isinstance(doc, str):
        return ""

    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokens = (token.lower() for token in nltk.regexp_tokenize(doc, pattern))
    lemmas = (
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in tokens
        if token not in _STOP_WORDS
    )
    # filtering happens after lemmatization, exactly as before: first 'lb', then non-English
    return " ".join(lemma for lemma in lemmas if lemma != 'lb' and lemma in english)

# Pick the column that holds the post text. 'selftext' is what this cell has
# always read, so it wins when present. The commented-out prep cell earlier in
# the notebook merges title + selftext into a single 'text' column and drops
# 'selftext', so fall back to 'text' when the loaded CSV was produced that way.
text_column = next((col for col in ('selftext', 'text') if col in posts.columns), None)
if text_column is None:
    raise KeyError("posts has neither a 'selftext' nor a 'text' column")

# one cleaned string per post, kept as a single-column DataFrame (column label 0)
lemmatized = pd.DataFrame([lemmadata(post) for post in posts[text_column]])

# bag-of-words counts: one row per post, one column per vocabulary term
vec = CountVectorizer()
X = vec.fit_transform(lemmatized[0])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old name on releases that predate get_feature_names_out()
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# NOTE(review): toarray() materializes the full dense posts-by-vocabulary matrix;
# consider a sparse representation if memory becomes a problem
df = pd.DataFrame(X.toarray(), columns=feature_names)

## 3. Exploratory Data Analysis

In [None]:
# column-wise totals: how often each vocabulary term occurs across all posts
counts = dict(df.sum(axis=0))

In [None]:
# the 100 most frequent terms as (term, count) pairs, highest count first;
# the slice makes the result match its name instead of holding the whole vocabulary
top100 = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:100]

In [None]:
# display the posts DataFrame loaded from the cleaned CSV
posts