## Reddit EDA

##### Author: Vala Rahmani

#### Exploratory Data Analysis

Importing the packages

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

**Importing the data**

In [2]:
## **********************************Vegan Paleo Data ************************************##
# Load the scraped posts of each subreddit and stack them into a single frame.
paleo = pd.read_csv('dataset/paleo_posts.csv')
vegan = pd.read_csv('dataset/vegan_posts.csv')
# sort=False: opt in to the future pandas behaviour explicitly (no column
# sorting when the two frames' columns are not aligned), which also silences
# the FutureWarning.
# ignore_index=True: give the stacked rows a fresh 0..n-1 index so that row
# labels from the two source files cannot collide.
data = pd.concat([paleo, vegan], sort=False, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [3]:
data.shape

(4848, 104)

Dropping all rows with a duplicate title, keeping only the last occurrence of each title.

In [4]:
# Keep a single row per title: when a title occurs several times, only its
# last occurrence survives.
data = data.drop_duplicates(subset=['title'], keep='last')

In [5]:
data.shape

(1783, 104)

In [6]:
data['selftext'].isnull().sum()

1012

In [7]:
data['subreddit'].value_counts()

vegan    1145
Paleo     638
Name: subreddit, dtype: int64

In [8]:
# data['selftext'] = data['selftext'].fillna(value = 'notext')

In [9]:
# Columns that are usable in the analysis; every other column of the raw
# scrape is dropped further down.
usable_columns = [
    'id',
    'author',
    'is_video',
    'name',
    'num_comments',
    'score',
    'selftext',
    'subreddit',
    'title',
    'ups',
]



In [10]:
data[usable_columns].isnull().sum()

id                 0
author             0
is_video           0
name               0
num_comments       0
score              0
selftext        1012
subreddit          0
title              0
ups                0
dtype: int64

In [11]:
# Keep only the usable columns. The explicit .copy() makes `data` an
# independent frame, so the column assignments made later in the notebook
# do not trigger pandas' SettingWithCopyWarning.
data = data[usable_columns].copy()

In [12]:
# Every usable column except the target column, `subreddit`
features = [column for column in usable_columns if column != 'subreddit']

In [13]:
data[features].head(2)

Unnamed: 0,id,author,is_video,name,num_comments,score,selftext,title,ups
673,a0k6zx,arav24,False,t3_a0k6zx,0,0,Hi guys! I am majoring in Nutritional Science ...,I'm a 18 year old aspiring blogger [blogspam],0
684,cb2t2d,techguySF,False,t3_cb2t2d,1,0,,Did Bone Marrow make us Human? [Discussion],0


In [14]:
data.describe()

Unnamed: 0,num_comments,score,ups
count,1783.0,1783.0,1783.0
mean,13.61189,94.482894,94.482894
std,31.998925,348.249139,348.249139
min,0.0,0.0,0.0
25%,2.0,3.0,3.0
50%,7.0,13.0,13.0
75%,15.0,57.0,57.0
max,655.0,4937.0,4937.0


Formatting the data
* For the number of comments I will use the median (7) as the threshold: if a post has more comments than the median, `num_comments` becomes 1; otherwise (at or below the median) it becomes 0.

In [15]:
data['num_comments'].median()

7.0

Posts that receive more comments than the median get a value of 1; all other posts get a value of 0.

In [16]:
 # Binarise the comment count: 1 when strictly above the median, else 0.
 # NOTE: this overwrites the raw counts, so re-running the cell would
 # threshold the already-binarised column.
 comment_median = data['num_comments'].median()
 data['num_comments'] = data['num_comments'].gt(comment_median).astype(int)

In [17]:
# One-hot encode the target. drop_first=True drops the first category
# ('Paleo'), leaving a single indicator column, `subreddit_vegan`.
# dtype=int pins the indicator to 0/1 integers; without it the dtype depends
# on the installed pandas version (newer releases produce booleans).
data = pd.get_dummies(data, columns=['subreddit'], drop_first=True, dtype=int)

64% of the data is from the vegan subreddit and 36% is from the Paleo subreddit.

In [18]:
data['subreddit_vegan'].value_counts(normalize=True)

1    0.642176
0    0.357824
Name: subreddit_vegan, dtype: float64

### Looking at the data to better understand the content of each post

In [19]:
data.shape

(1783, 10)

In [20]:
data.columns

Index(['id', 'author', 'is_video', 'name', 'num_comments', 'score', 'selftext',
       'title', 'ups', 'subreddit_vegan'],
      dtype='object')

In [21]:
data['selftext'] = data['selftext'].fillna('')

In [22]:
# Since there are a lot of rows missing selftext, selftext is added to the title to create a row with most content possible.
data['title_selftext'] = data['title'] + " " + data['selftext']

In [23]:
data['title_selftext'].isna().sum()

0

## Data Cleaning

In [1]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import regex as re
from nltk.tokenize import RegexpTokenizer


# Words that may leak the label into the model (subreddit names and tag-like
# words found in the titles).
# NOTE(review): the cleaner filters token by token, so the two-word entries
# ('needing help', 'food pic') can never match a token; 'help' is therefore
# not filtered. Confirm whether that is intended.
_LEAKY_WORDS = ['paleo', 'vegan', 'needing help', 'needing', 'vegans', 'question',
                'discussion', 'food pic', 'pic', 'food', 'blogspam']

# Filled lazily on the first call to text_cleaner.
_STOPWORD_CACHE = None


def _stopword_set():
    """Return the NLTK English stop words plus the leak-prone words, built once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # Loading the corpus on every call is wasted work when the cleaner is
        # applied row by row, so the set is built a single time and reused.
        _STOPWORD_CACHE = frozenset(stopwords.words('english')) | frozenset(_LEAKY_WORDS)
    return _STOPWORD_CACHE


def text_cleaner(text):
    """Normalise one post text into a space-separated string of kept words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace and drop the stop words
    (NLTK English stop words plus the leak-prone words listed above).

    Parameters
    ----------
    text : str or None or float NaN
        Raw text. Missing values (None / NaN) are treated as empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" when nothing is left).
    """
    # Missing values would make BeautifulSoup raise; treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # 1. Remove the html from the text. The parser is named explicitly so the
    #    result does not depend on which optional parsers are installed and
    #    BeautifulSoup does not warn about guessing one.
    nohtml = BeautifulSoup(text, "html.parser").get_text()

    # 2. Replace the non-letter characters with spaces
    letters_only = re.sub("[^a-zA-Z]", " ", nohtml)

    # 3./4. Lower-case and tokenize on whitespace. Only letters and spaces
    #       remain at this point, so str.split() yields the same tokens as a
    #       whitespace-gap regex tokenizer.
    words = letters_only.lower().split()

    # 5. Remove the stop words
    stops = _stopword_set()
    return " ".join(word for word in words if word not in stops)

In [25]:
import time

# Clean the bare title and the combined title + selftext, timing the whole step.
start_time = time.time()
print('Cleaning the title columns for each post')
print(f"There are total of {data.shape[0]} titles")
data['clean_title'] = data['title'].apply(text_cleaner)
print("Title Cleaning is complete!")
data['clean_title_selftext'] = data['title_selftext'].apply(text_cleaner)
print("Title_Selftext Cleaning is also complete!")
print("--- Took %s seconds ---" % (time.time() - start_time))

Cleaning the title columns for each post
There are total of 1783 titles
Title Cleaning is complete!
Title_Selftext Cleaning is also complete!
--- Took 1.5788700580596924 seconds ---


#### Save the cleaned up data to use in the modeling notebook

In [26]:
print(data['clean_title'].isna().sum())
print(data['clean_title_selftext'].isna().sum())


0
0


In [43]:
data.head(1)

Unnamed: 0,id,author,is_video,name,num_comments,score,selftext,title,ups,subreddit_vegan,title_selftext,clean_title,clean_title_selftext
673,a0k6zx,arav24,False,t3_a0k6zx,0,0,Hi guys! I am majoring in Nutritional Science ...,I'm a 18 year old aspiring blogger [blogspam],0,0,I'm a 18 year old aspiring blogger [blogspam] ...,year old aspiring blogger,year old aspiring blogger hi guys majoring nut...


In [45]:
data.to_csv('dataset/clean_data_vegan.csv',index=False, encoding='utf-8')

## Basic modeling to get a feel for the data

*Let's start with a CountVectorizer baseline and then add Tfidf to our model*

In [None]:
data_saved = pd.read_csv('dataset/clean_data_vegan.csv')

In [None]:
# Check the frame that was just re-read from disk, not the in-memory one:
# text that was cleaned down to an empty string is written as an empty CSV
# field and comes back from read_csv as NaN.
data_saved['clean_title_selftext'].isnull().sum()

In [33]:
#********************************** Vegan Paleo Train Test Split *******************************#
from sklearn.model_selection import train_test_split

# Predictors: every column other than the dummy-encoded target.
features = [column for column in data.columns if column != 'subreddit_vegan']

# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data['subreddit_vegan'],
    stratify=data['subreddit_vegan'],
    random_state=42,
)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level counts of unigrams and bigrams, capped at 2500 features.
# The titles are already cleaned, so no extra tokenizer or stop-word list.
cvec = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=2500,
    stop_words=None,
    tokenizer=None,
)

# Fit the vocabulary on the training titles only, then reuse it for the test titles.
cvec_train_features = cvec.fit_transform(X_train['clean_title'])
cvec_test_features = cvec.transform(X_test['clean_title'])

In [35]:
from sklearn.linear_model import LogisticRegression

# Pin the solver instead of relying on the library default: the default
# changed between scikit-learn releases (the recorded run shows the
# deprecated solver='warn' placeholder, which resolved to liblinear), so an
# explicit value keeps the fitted model comparable across versions.
logistic = LogisticRegression(solver='liblinear')
logistic.fit(cvec_train_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
# Mean accuracy of the count-vectorizer model on the train and test splits.
print('The accuracy of the model is as following')
cvec_train_score = logistic.score(cvec_train_features, y_train)
cvec_test_score = logistic.score(cvec_test_features, y_test)
print(f"Train score:{cvec_train_score}")
print(f"Test score:{cvec_test_score}")

The accuracy of the model is as following
Train score:0.9401645474943904
Test score:0.757847533632287


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same settings as the count vectorizer, but with TF-IDF weighting.
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=2500,
    stop_words=None,
    tokenizer=None,
)

# Fit on the training titles only; the test titles are transformed with the fitted vocabulary.
tfidf_train_features = tfidf.fit_transform(X_train['clean_title'])
tfidf_test_features = tfidf.transform(X_test['clean_title'])

In [38]:
# Fresh, unfitted model for the TF-IDF features. The solver is pinned because
# the library default changed between scikit-learn releases; liblinear is
# what the deprecated default resolved to.
logistic = LogisticRegression(solver='liblinear')

In [39]:
# Fit on the TF-IDF features, then report train and test accuracy (in that order).
logistic.fit(tfidf_train_features, y_train)
tfidf_train_score = logistic.score(tfidf_train_features, y_train)
tfidf_test_score = logistic.score(tfidf_test_features, y_test)
print(tfidf_train_score)
print(tfidf_test_score)

0.8548990276738968
0.742152466367713


In [40]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
tfidf_vocab = (tfidf.get_feature_names_out()
               if hasattr(tfidf, 'get_feature_names_out')
               else tfidf.get_feature_names())
# Dense frame of the training TF-IDF weights, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_train_features.toarray(), columns=tfidf_vocab)

In [42]:
tfidf_df['chicken'].value_counts()

0.000000    1302
0.250882       1
0.187708       1
0.246705       1
0.227592       1
0.342806       1
0.302914       1
0.384680       1
0.349897       1
0.255169       1
0.259914       1
0.122844       1
0.368082       1
0.278289       1
0.283096       1
0.184880       1
1.000000       1
0.385045       1
0.160557       1
0.283369       1
0.486920       1
0.246527       1
0.450651       1
0.381005       1
0.209634       1
0.266910       1
0.422553       1
0.423372       1
0.160042       1
0.226920       1
0.362818       1
0.446576       1
0.453762       1
0.289118       1
0.188220       1
0.315688       1
Name: chicken, dtype: int64