In [7]:
import pandas as pd
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

### Pulling information from all of our subreddits into seperate dataframes

### askreddit

In [40]:
askreddit_df = pd.read_csv("./datasets/askreddit.csv").head(5000)

In [41]:
askreddit_df.shape

(5000, 65)

### explainlikeimfive

In [42]:
explainlikeimfive_df = pd.read_csv("./datasets/explainlikeimfive.csv").head(5000)

In [43]:
explainlikeimfive_df.shape

(5000, 69)

### amitheasshole

In [44]:
amitheasshole_df = pd.read_csv("./datasets/amitheasshole.csv").head(10000)

In [45]:
amitheasshole_df.shape

(10000, 65)

## moving all of the information into an aggregate dataset

In [46]:
aggregate_df = pd.concat([askreddit_df, explainlikeimfive_df, amitheasshole_df], axis = 0)

### the only information we care about is the title and subreddit so i'm making a dataframe that removes the removes the rest of the information

In [47]:
title_sub_df = aggregate_df[['title', 'subreddit']]

In [48]:
title_sub_df.head()

Unnamed: 0,title,subreddit
0,"People, who had found close and good relations...",AskReddit
1,What’s been your worst experience in a video g...,AskReddit
2,What's a piece of reddit history everyone shou...,AskReddit
3,What are some cultural Differences you've noti...,AskReddit
4,What’s been your worst experience in a video game,AskReddit


## Data Cleaning

### removing all numbers and useless punctuation <n>

#### we're removing numbers and punctuation so that we can find lemmatize full words instead of words that are tainted by the punctuation, and numbers are being removed due to the fact that almost all of them were being used as random one off numbers which had no effect on our models

In [49]:
title_sub_df['title'] = title_sub_df['title'].str.replace('[^\w\s]', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [50]:
title_sub_df['title'] = [''.join([n for n in sentence if not n.isdigit()]) for sentence in title_sub_df['title']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [51]:
title_sub_df['title'].head()

0    People who had found close and good relationsh...
1     Whats been your worst experience in a video game
2    Whats a piece of reddit history everyone shoul...
3    What are some cultural Differences youve notic...
4     Whats been your worst experience in a video game
Name: title, dtype: object

### just adjusting the data before any substantial data cleaning so that it can be read into my ending dataframe easier

In [52]:
title_sub_df['title'] = title_sub_df['title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [53]:
title_sub_df = title_sub_df.reset_index()

### data engineering a sentiment column, character count, and word count before lematization and transformation as they seem like they will be useful in our model creation <n>

In [54]:
sent = SentimentIntensityAnalyzer()
title_sub_df['sentiment'] = title_sub_df['title'].apply(lambda y: sent.polarity_scores(y)['compound'])

In [55]:
title_sub_df['char_count'] = title_sub_df['title'].apply(lambda y: len(y))

In [56]:
title_sub_df['word_count'] = title_sub_df['title'].apply(lambda y: len(word_tokenize(y)[:]))

In [57]:
lemmatizer = WordNetLemmatizer()

In [58]:
title_sub_df['title']  = [' '.join([lemmatizer.lemmatize(word) for word in sentence.split(' ')]) for sentence in title_sub_df['title']]

#### creating a vectorizer for the data so that it can fit into the models well, as i am using so much data i created a reasonably min_df for that amount of data 

In [59]:
vectorizer = CountVectorizer(stop_words = 'english', min_df = 100)

In [60]:
X = vectorizer.fit_transform(title_sub_df['title'])

In [61]:
count_vect_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())

In [62]:
count_vect_df = pd.concat([title_sub_df, count_vect_df], axis = 1)

In [63]:
count_vect_df = count_vect_df.loc[:, ~count_vect_df.columns.duplicated()]

#### This is a lazy method to remove overly powerful words that i came back to remove but didn't want to have to adjust the stopwords but if we were to refine this i would add this into the stopwords that get removed but since it was so few it didn't seem worth it

In [64]:
count_vect_df = count_vect_df.drop(['eli', 'aita', 'reddit', 'wibta'], axis = 1)

#### binarizing the data so that it can be used efficiently in our classification model

In [65]:
count_vect_df['subreddit'] = count_vect_df['subreddit'].apply(lambda y: 1 if y == 'AmItheAsshole' else 0)

#### last minute data cleaning again on the count_vect_df before it gets saved

In [66]:
count_vect_df = count_vect_df.drop(['index'], axis = 1)

#### Scaling the data so that it can be used well in logistic regression and naive bayesian classification

In [67]:
scaler = StandardScaler()

In [68]:
count_vect_df[['char_count', 'word_count']] = scaler.fit_transform(count_vect_df[['char_count', 'word_count']])

#### Saving these dataframes so that i can pull them again without having to rerun the whole data cleaning set

In [69]:
count_vect_df.to_csv('./datasets/count_vect_df.csv', index = False)

In [71]:
title_sub_df.drop(['index'], axis =1).to_csv('./datasets/title_sub_df.csv',index = False)