## Project 3: Web APIs & Classification
---
Project notebook organisation:<br>
**1_Get_and_Clean_Data** (current notebook)<br>
<a href='./2_EDA_and_Preprocessing.ipynb'>2_EDA_and_Preprocessing</a><br>
<a href='./3_Models.ipynb'>3_Models</a><br>

---
### This notebook's layout
<a href='#gld'>Scraping from Subreddit:Gold</a><br>
<a href='#sil'>Scraping from Subreddit:Silverbugs</a><br>
<a href='#import'>Re-import and and check dataframes</a><br>
<a href='#duplicate'>Removes duplicates</a><br>
<a href='#rename'>Column filtering and renaming</a><br>
<a href='#miss'>Working on null/missing values</a><br>
<a href='#comment'>Remove common comment headers</a><br>
<a href='#rm_web'>Remove http and www</a><br>
<a href='#function'>Data cleaning and processing in a pre-defined function</a><br>

### Problem statement

I am a financial quantitative analyst with a Hedge fund company. Recently, our company is re-balancing our portfolio on<br> precious metals and I was tasked to leverage on data to identify any trends, sentiments in precious metals sector.<br> 

The first step is to use a classification models to differentiate between a gold and silver related Reddit post.<br> 
This is critical before we proceed to the next step<br>

Secondly, to examine Reddit's post and gaining insights to :

Which precious metal is best for investment purposes?

Which form of precious metals investment (CFDs, ETFs, futures, gold related stock, physical holdings)

Any other findings from post related stats like the number of comments of likes to a title post


In [35]:
import time
import pandas as pd
import requests
from numpy.random import randint
import numpy as np
import emoji
import re

from bs4 import BeautifulSoup 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# Declare urls and header variables
url_gold = 'https://www.reddit.com/r/Gold.json'
url_silver = 'https://www.reddit.com/r/Silverbugs.json'

headers = {'User-agent':'my_agent007'}

<a id='gld'></a>
### Scraping from Subreddit:Gold

In [37]:
"""
# This cell is commented-out
# Scraping takes time and the data is not the same when replicated
# Intent here is to show the code in getting the data
gld_posts = []
after = None
 
for a in range(150):
    if after == None:
        current_url = url_gold
    else:
        current_url = url_gold + '?after=' + after
    res = requests.get(current_url, headers=headers)
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    gld_posts.extend(current_posts)
    after = current_dict['data']['after']
 
    #  random sleep time
    time.sleep(randint(1,3,1))
"""

"\n# This cell is commented-out\n# Scraping takes time and the data is not the same when replicated\n# Intent here is to show the code in getting the data\ngld_posts = []\nafter = None\n \nfor a in range(150):\n    if after == None:\n        current_url = url_gold\n    else:\n        current_url = url_gold + '?after=' + after\n    res = requests.get(current_url, headers=headers)\n    \n    if res.status_code != 200:\n        print('Status error', res.status_code)\n        break\n    \n    current_dict = res.json()\n    current_posts = [p['data'] for p in current_dict['data']['children']]\n    gld_posts.extend(current_posts)\n    after = current_dict['data']['after']\n \n    #  random sleep time\n    time.sleep(randint(1,3,1))\n"

<a id='sil'></a>
### Scraping from Subreddit:Silverbugs

In [38]:
"""
# This cell is commented-out
# Scraping takes time and the data is not the same when replicated
# Intent here is to show the code in getting the data
sil_posts = []
after = None
 
for a in range(150):
    if after == None:
        current_url = url_silver
    else:
        current_url = url_silver + '?after=' + after
    res = requests.get(current_url, headers=headers)
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    sil_posts.extend(current_posts)
    after = current_dict['data']['after']
 
    #  random sleep time
    time.sleep(randint(1,3,1))
"""

"\n# This cell is commented-out\n# Scraping takes time and the data is not the same when replicated\n# Intent here is to show the code in getting the data\nsil_posts = []\nafter = None\n \nfor a in range(150):\n    if after == None:\n        current_url = url_silver\n    else:\n        current_url = url_silver + '?after=' + after\n    res = requests.get(current_url, headers=headers)\n    \n    if res.status_code != 200:\n        print('Status error', res.status_code)\n        break\n    \n    current_dict = res.json()\n    current_posts = [p['data'] for p in current_dict['data']['children']]\n    sil_posts.extend(current_posts)\n    after = current_dict['data']['after']\n \n    #  random sleep time\n    time.sleep(randint(1,3,1))\n"

In [39]:
"""
# Checking the length after scrapping
print(len(gld_posts))
print(len(sil_posts))
"""

'\n# Checking the length after scrapping\nprint(len(gld_posts))\nprint(len(sil_posts))\n'

In [40]:
"""
# Convert scrapped data into dataframe
gold = pd.DataFrame(gld_posts)
silver = pd.DataFrame(sil_posts)
"""

'\n# Convert scrapped data into dataframe\ngold = pd.DataFrame(gld_posts)\nsilver = pd.DataFrame(sil_posts)\n'

In [41]:
"""
print(gold.shape)
print(silver.shape)
"""

'\nprint(gold.shape)\nprint(silver.shape)\n'

In [42]:
"""
# Export data to csv
gold.to_csv('../data/gold.csv', index = False)
silver.to_csv('../data/silver.csv', index = False)
"""

"\n# Export data to csv\ngold.to_csv('../data/gold.csv', index = False)\nsilver.to_csv('../data/silver.csv', index = False)\n"

<a id='import'></a>
### Re-import and and check dataframes

In [43]:
# Re-import both subreddits
gold = pd.read_csv('../data/gold.csv')
silver = pd.read_csv('../data/silver.csv')

In [44]:
# Checks the shape
print(gold.shape)
print(silver.shape)

(6159, 110)
(6223, 111)


<a id='duplicate'></a>
### Removes duplicates

Post are indexed via 'name' string variable<br>
will need to remove duplicated ones

In [45]:
# Checking duplicates base on 'name'
gold_duplicates_by_name = gold['name'].duplicated().value_counts()
gold_duplicates_by_name

True     5284
False     875
Name: name, dtype: int64

In [46]:
# Checking duplicates base on 'name'
silver_duplicates_by_name = silver['name'].duplicated().value_counts()
silver_duplicates_by_name

True     5182
False    1041
Name: name, dtype: int64

In [47]:
# The inverse of duplicates are the non-duplicates in which we will keep 
gold = gold[~gold['name'].duplicated()]
silver = silver[~silver['name'].duplicated()]

In [48]:
# Checks to ensure the filtering is done correctly
# 6159-5284 = 875
# 6223-5182 = 1041
print(gold.shape)
print(silver.shape)

(875, 110)
(1041, 111)


In [49]:
# Further checks by looking at subreddit counts
print(gold['subreddit'].value_counts())
print(silver['subreddit'].value_counts())

Gold    875
Name: subreddit, dtype: int64
Silverbugs    1041
Name: subreddit, dtype: int64


<a id='rename'></a>
### Column filtering and renaming

In [50]:
# Extract 8 features from gold
gold = gold[['name','author','title','selftext','permalink','num_comments','ups','subreddit']]

In [51]:
# Extract 8 features from silver
silver = silver[['name','author','title','selftext','permalink','num_comments','ups', 'subreddit']]

In [52]:
# Some columns need to be rename for easier understanding
new_column_name = ['post_id','author','title','post_body','comments','num_comments','ups', 'subreddit']

In [53]:
# Re-assign column names
gold.columns = new_column_name
silver.columns = new_column_name

In [54]:
# Check that the column re-naming is done correctly
print(gold.columns)
print(silver.columns)

Index(['post_id', 'author', 'title', 'post_body', 'comments', 'num_comments',
       'ups', 'subreddit'],
      dtype='object')
Index(['post_id', 'author', 'title', 'post_body', 'comments', 'num_comments',
       'ups', 'subreddit'],
      dtype='object')


In [55]:
print(gold.shape)
print(silver.shape)

(875, 8)
(1041, 8)


<a id='miss'></a>
### Working on null/missing values

In [56]:
# Checking null values across all columns
gold_isnull_val = gold.isnull().sum().sort_values(ascending = False)
silver_isnull_val = silver.isnull().sum().sort_values(ascending = False)
print(gold_isnull_val.head(4))
print(silver_isnull_val.head(4))

post_body       443
subreddit         0
ups               0
num_comments      0
dtype: int64
post_body       737
subreddit         0
ups               0
num_comments      0
dtype: int64


In [57]:
# re-assigns null values to 'None'
#gold.loc[gold['post_body'].isnull(), 'post_body'] = 'None'
#silver.loc[silver['post_body'].isnull(), 'post_body'] = 'None'

gold['post_body'].fillna(value = 'None', inplace=True)
silver['post_body'].fillna(value = 'None', inplace=True)

In [58]:
# Checking that NaN replaced with 'None'
print(gold_isnull_val.head(2))
print(silver_isnull_val.head(2))

post_body    443
subreddit      0
dtype: int64
post_body    737
subreddit      0
dtype: int64


<a id='comment'></a>
### Remove common comment headers

In [59]:
# The standard header indicates the start of a comments in every post's comment
# It is redundant and its data leaking in nature. It should be removed
# \w+\d*\w+|d*/ matches commentor id
# regex explanation : (1 word one or more) AND (1 digit zero or more) AND (1 word one or more|1 digit zero or more)

rm_gold_comment = '/r/Gold/comments/\w+\d*\w+|d*/'
rm_silver_comment = '/r/Silverbugs/comments/\w+\d*\w+|d*/'

# The number of 'comment header' is the same as the data's row
g_count = gold['comments'].str.contains('/r/Gold/comments/').sum()
s_count = silver['comments'].str.contains('/r/Silverbugs/comments/').sum()
print(g_count)
print(s_count)

875
1041


In [60]:
# Removing the'comment header' in both subreddit
gold['comments'] = gold['comments'].str.replace(rm_gold_comment, "")
silver['comments'] = silver['comments'].str.replace(rm_silver_comment, "")

In [61]:
# checks that the removal was done correctly
g_count = gold['comments'].str.contains('/r/Gold/comments/').sum()
s_count = silver['comments'].str.contains('/r/Silverbugs/comments/').sum()

print(g_count)
print(s_count)

0
0


<a id='rm_web'></a>
### Remove http and www

In [62]:
def rm_web(df):
    """Removes http,www on dataframe columns"""
    common_column_names = ['title','post_body','comments']
    for col_name in common_column_names:
        df[col_name] = df[col_name].replace(r"http\S+", '',regex=True)
        df[col_name] = df[col_name].replace(r"www\S+", '',regex=True)
    print('http and www elements removed')  

In [63]:
rm_web(gold)
rm_web(silver)

http and www elements removed
http and www elements removed


<a id='function'></a>
### Data cleaning and processing in a pre-defined function

In [64]:
def clean_data(raw_review):
    """ Performs many cleaning and processing in one function"""
        
    # - Remove HTML.
    review_text = BeautifulSoup(raw_review).get_text()
    
    # - Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # - Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', letters_only)
   
    # - Convert to lower case, split into individual words.
    words = text.lower().split()
    
    # - In Python, searching a set is much faster than searching
    # a list, so convert the stopwords to a set.
    stops = set(stopwords.words('english'))
    
    # - Remove stopwords.
    stop_words = [w for w in words if not w in stops]
    
    # - Lemmatizer
    lemmatizer = WordNetLemmatizer()
    meaningful_words = [lemmatizer.lemmatize(w) for w in stop_words] 
  
    # - Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

In [65]:
def run_cleaner(df):
    """Run clean_data on dataframe columns"""
    common_column_names = ['title','post_body','comments']
    for col_name in common_column_names:
        df[col_name] = df[col_name].apply(clean_data)
    print('processing complete')  

In [66]:
run_cleaner(gold)

processing complete


In [67]:
run_cleaner(silver)

processing complete


In [68]:
gold.to_csv('../data/gold_clean.csv', index = False)
silver.to_csv('../data/silver_clean.csv', index = False)