# Test Data Scraping and Cleaning

In this notebook we scrape and clean new data to test our model on.
The steps here are similar to those in notebooks 1 and 3; refer to those notebooks for explanations and annotations of each step.

In [5]:
import pandas as pd
import requests
import time
import random
from bs4 import BeautifulSoup

In [6]:
import nltk
import regex as re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [7]:
url = 'https://www.reddit.com/r/iphone/new.json'
# One-off request against the listing endpoint. Its response is not read:
# the scraping loop in the next cell rebinds `res` before using it.
# The timeout keeps an unresponsive server from hanging the notebook.
res = requests.get(url, headers={'User-agent': 'Zaini Inc'}, timeout=30)

In [8]:
# Page through the subreddit's "new" listing and collect the raw post dicts.
# Each response carries an `after` cursor; sending it back as the `after`
# query parameter requests the following page.
iphone_posts = []
after = None

for a in range(10):
    # The first request has no cursor; later ones continue from the last page.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from blocking the loop forever.
    res = requests.get(current_url, headers={'User-agent': 'Zaini Inc 1.0'}, timeout=30)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    iphone_posts.extend(current_posts)
    after = current_dict['data']['after']

    # A null cursor means there are no further pages. Stop here: otherwise the
    # next iteration would fall back to the base URL, download the first page
    # again and append duplicate posts.
    if after is None:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(6,10)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/iphone/new.json
7
https://www.reddit.com/r/iphone/new.json?after=t3_k65ygm
10
https://www.reddit.com/r/iphone/new.json?after=t3_k51tyu
9
https://www.reddit.com/r/iphone/new.json?after=t3_k3dowr
8
https://www.reddit.com/r/iphone/new.json?after=t3_k1rnu3
8
https://www.reddit.com/r/iphone/new.json?after=t3_jznwal
8
https://www.reddit.com/r/iphone/new.json?after=t3_jyggjr
7
https://www.reddit.com/r/iphone/new.json?after=t3_jxt25m
10
https://www.reddit.com/r/iphone/new.json?after=t3_jwt1qq
8
https://www.reddit.com/r/iphone/new.json?after=t3_jvj50u
10


In [9]:
# Number of posts collected across all fetched pages.
len(iphone_posts)

250

In [10]:
# Build a DataFrame from the list of raw post records (one row per post).
iphone_df = pd.DataFrame(iphone_posts)

In [11]:
# Preview the first rows of the raw, uncleaned frame.
iphone_df.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent,media_metadata
0,,iphone,,t2_5whi1gxg,False,,0,False,Here's how you can download any file on your i...,[],...,False,http://arcanelostcom.wpcomstaging.com/download...,2818664,1607109000.0,0,,False,,,
1,,iphone,,t2_382egh3u,False,,0,False,Discussion With iPhone 12 Pro and iPhone 12 Pr...,[],...,False,https://shiv1367.com/iphone-12-pro-max-and-iph...,2818664,1607102000.0,0,,False,,,
2,,iphone,,t2_th4cg,False,,0,False,iPhone 12 wireless charging problem – fix on t...,[],...,False,https://9to5mac.com/2020/12/04/iphone-12-wirel...,2818664,1607087000.0,0,,False,,,
3,,iphone,Welcome to the Daily Tech Support thread for ...,t2_6l4z3,False,,0,False,Daily Tech Support Thread,[],...,True,https://www.reddit.com/r/iphone/comments/k6ivi...,2818664,1607080000.0,0,,False,,,
4,,iphone,Welcome to the weekly stickied WSIB thread. ...,t2_6l4z3,False,,0,False,Weekly What Should I Buy Thread,[],...,True,https://www.reddit.com/r/iphone/comments/k6ivi...,2818664,1607080000.0,0,,False,,,


In [12]:
# Clean the scraped text: replace punctuation with spaces and fill missing values

In [13]:
# Replace every character that is neither a word character nor whitespace
# with a space. The result is assigned back to the column on purpose:
# `iphone_df[col].replace(..., inplace=True)` operates on a temporary Series,
# which pandas warns about and which no longer updates the DataFrame once
# Copy-on-Write is enabled.
iphone_df['selftext'] = iphone_df['selftext'].replace(r'[^\w\s]', ' ', regex=True)
iphone_df['title'] = iphone_df['title'].replace(r'[^\w\s]', ' ', regex=True)

# Missing values (in any column) become a single space, so that concatenating
# the text columns afterwards does not produce NaN.
iphone_df = iphone_df.fillna(value=' ')

In [14]:
# Add a combined text field (post body first, then title) and narrow the
# frame to the five columns that are still needed.
iphone_df = iphone_df.assign(
    **{'title/text': iphone_df['selftext'] + ' ' + iphone_df['title']}
)[['author_fullname', 'title', 'selftext', 'title/text', 'subreddit']]

iphone_df

Unnamed: 0,author_fullname,title,selftext,title/text,subreddit
0,t2_5whi1gxg,Here s how you can download any file on your i...,,Here s how you can download any file on your ...,iphone
1,t2_382egh3u,Discussion With iPhone 12 Pro and iPhone 12 Pr...,,Discussion With iPhone 12 Pro and iPhone 12 P...,iphone
2,t2_th4cg,iPhone 12 wireless charging problem fix on t...,,iPhone 12 wireless charging problem fix on ...,iphone
3,t2_6l4z3,Daily Tech Support Thread,Welcome to the Daily Tech Support thread for ...,Welcome to the Daily Tech Support thread for ...,iphone
4,t2_6l4z3,Weekly What Should I Buy Thread,Welcome to the weekly stickied WSIB thread ...,Welcome to the weekly stickied WSIB thread ...,iphone
...,...,...,...,...,...
245,t2_ay4ik,iPhone 12 Pro Max leather case,Got my 12 pro max with Apple leather case \n...,Got my 12 pro max with Apple leather case \n...,iphone
246,t2_6l4z3,Daily Tech Support Thread November 15,Welcome to the Daily Tech Support thread for ...,Welcome to the Daily Tech Support thread for ...,iphone
247,t2_9xqhj,My MagSafe charger gets really warm after bein...,So I got my 12 Pro Max yesterday and the MagSa...,So I got my 12 Pro Max yesterday and the MagSa...,iphone
248,t2_1svtzo4x,Feature iPhone now allows you to name Blueto...,,Feature iPhone now allows you to name Bluet...,iphone


In [15]:
# Keep the first occurrence of each distinct combined text, then retain only
# the combined text and the subreddit column.
iPhone_filtered = (
    iphone_df
    .drop_duplicates(subset=['title/text'], keep='first')
    .loc[:, ['title/text', 'subreddit']]
)

iPhone_filtered.shape

(250, 2)

In [16]:
url = 'https://www.reddit.com/r/Android/new.json'
# One-off request against the listing endpoint. Its response is not read:
# the scraping loop in the next cell rebinds `res` before using it.
# The timeout keeps an unresponsive server from hanging the notebook.
res = requests.get(url, headers={'User-agent': 'Not Zaini Inc'}, timeout=30)

In [18]:
# Page through the subreddit's "new" listing and collect the raw post dicts.
# Each response carries an `after` cursor; sending it back as the `after`
# query parameter requests the following page.
android_posts = []
after = None

for a in range(10):
    # The first request has no cursor; later ones continue from the last page.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from blocking the loop forever.
    res = requests.get(current_url, headers={'User-agent': 'Zaini Inc 1.0'}, timeout=30)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    android_posts.extend(current_posts)
    after = current_dict['data']['after']

    # A null cursor means there are no further pages. Stop here: otherwise the
    # next iteration would fall back to the base URL, download the first page
    # again and append duplicate posts.
    if after is None:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(6,10)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/Android/new.json
7
https://www.reddit.com/r/Android/new.json?after=t3_k5ay9a
7
https://www.reddit.com/r/Android/new.json?after=t3_k3txi1
7
https://www.reddit.com/r/Android/new.json?after=t3_k1o0rz
8
https://www.reddit.com/r/Android/new.json?after=t3_k0sbzx
6
https://www.reddit.com/r/Android/new.json?after=t3_jy9sae
10
https://www.reddit.com/r/Android/new.json?after=t3_jx3e2s
10
https://www.reddit.com/r/Android/new.json?after=t3_jw6g8k
10
https://www.reddit.com/r/Android/new.json?after=t3_jut90q
8
https://www.reddit.com/r/Android/new.json?after=t3_jtdw6f
8


In [19]:
# Check the container type holding the scraped Android posts.
type(android_posts)

list

In [20]:
# Number of posts collected across all fetched pages.
len(android_posts)

250

In [21]:
# Build a DataFrame from the list of raw post records (one row per post)
# and preview its first rows.
android_df = pd.DataFrame(android_posts)
android_df.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,link_flair_template_id,url_overridden_by_dest
0,,Android,**Samsung Buy Back Guranteed Buy Back Program...,t2_14auav,False,,0,False,Samsung S20 Guaranteed Buy Back Program Change...,[],...,all_ads,False,https://www.reddit.com/r/Android/comments/k6nv...,2281885,1607099000.0,0,,False,,
1,,Android,it's kinda neat,t2_lxrm6,False,,0,False,TIL a little Gboard trick. Select a word you'v...,[],...,all_ads,False,https://www.reddit.com/r/Android/comments/k6ea...,2281885,1607058000.0,0,,False,,
2,,Android,[Bose QC 35 IIs Promotion - Product Documenta...,t2_88cnb8an,False,,0,False,You should be able to claim those 'free' Bose ...,[],...,all_ads,False,https://www.reddit.com/r/Android/comments/k64r...,2281885,1607025000.0,0,,False,8eb9cbac-413c-11e3-ab4c-12313b04ceaf,
3,,Android,,t2_172iji,False,,0,False,2021 Smartphone cameras - What to expect!,[],...,all_ads,False,https://www.youtube.com/watch?v=AdaOJw2BLQg,2281885,1607095000.0,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,,https://www.youtube.com/watch?v=AdaOJw2BLQg
4,,Android,,t2_kvvva,False,,0,False,"""Pixel 5 Teardown - ITS ALIVE! (kinda..)""",[],...,all_ads,False,https://www.youtube.com/watch?v=3KMdkC536vs,2281885,1607092000.0,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,,https://www.youtube.com/watch?v=3KMdkC536vs


In [22]:
# Replace every character that is neither a word character nor whitespace
# with a space. The result is assigned back to the column on purpose:
# `android_df[col].replace(..., inplace=True)` operates on a temporary Series,
# which pandas warns about and which no longer updates the DataFrame once
# Copy-on-Write is enabled.
android_df['selftext'] = android_df['selftext'].replace(r'[^\w\s]', ' ', regex=True)

android_df['title'] = android_df['title'].replace(r'[^\w\s]', ' ', regex=True)

# Missing values (in any column) become a single space, so that concatenating
# the text columns afterwards does not produce NaN.
android_df = android_df.fillna(value=' ')

In [23]:
# Add a combined text field (post body first, then title) and narrow the
# frame to the five columns that are still needed.
android_df = android_df.assign(
    **{'title/text': android_df['selftext'] + ' ' + android_df['title']}
)[['author_fullname', 'title', 'selftext', 'title/text', 'subreddit']]

android_df

Unnamed: 0,author_fullname,title,selftext,title/text,subreddit
0,t2_14auav,Samsung S20 Guaranteed Buy Back Program Change...,Samsung Buy Back Guranteed Buy Back Program...,Samsung Buy Back Guranteed Buy Back Program...,Android
1,t2_lxrm6,TIL a little Gboard trick Select a word you v...,it s kinda neat,it s kinda neat TIL a little Gboard trick Sel...,Android
2,t2_88cnb8an,You should be able to claim those free Bose ...,Bose QC 35 IIs Promotion Product Documenta...,Bose QC 35 IIs Promotion Product Documenta...,Android
3,t2_172iji,2021 Smartphone cameras What to expect,,2021 Smartphone cameras What to expect,Android
4,t2_kvvva,Pixel 5 Teardown ITS ALIVE kinda,,Pixel 5 Teardown ITS ALIVE kinda,Android
...,...,...,...,...,...
245,t2_772lrq6s,HTC sales nearly halved last month Taipei Times,,HTC sales nearly halved last month Taipei T...,Android
246,t2_zh1pu,I tested the Note 20 Ultra Exynos 990 model an...,You won t believe in my country they only sell...,You won t believe in my country they only sell...,Android
247,t2_2dbyebba,Mysterious Bugs Were Used to Hack iPhones and ...,,Mysterious Bugs Were Used to Hack iPhones and...,Android
248,t2_7k499ptf,Android enthusiasts Do agree or disagree with...,What happens when designers prioritize aesthe...,What happens when designers prioritize aesthe...,Android


In [24]:
# Keep the first occurrence of each distinct combined text, then retain only
# the combined text and the subreddit column.
android_filtered = (
    android_df
    .drop_duplicates(subset=['title/text'], keep='first')
    .loc[:, ['title/text', 'subreddit']]
)

android_filtered.shape

(250, 2)

In [25]:
# Stack the Android rows on top of the iPhone rows. Each part keeps its own
# row labels at this point, so labels repeat until the index is reset.
combined_test = pd.concat([android_filtered, iPhone_filtered])

In [26]:
# (rows, columns) of the combined frame.
combined_test.shape

(500, 2)

In [27]:
# Renumber the rows 0..n-1 and discard the per-subreddit labels.
combined_test = combined_test.reset_index(drop=True)

In [28]:
# Encode the subreddit name as the numeric target: iphone -> 1, Android -> 0.
combined_test = combined_test.replace({'subreddit': {'iphone': 1, 'Android': 0}})

In [29]:
# First three rows: these come from the Android part, which was concatenated first.
combined_test.head(3)

Unnamed: 0,title/text,subreddit
0,Samsung Buy Back Guranteed Buy Back Program...,0
1,it s kinda neat TIL a little Gboard trick Sel...,0
2,Bose QC 35 IIs Promotion Product Documenta...,0


In [30]:
# Last three rows: these come from the iPhone part, which was concatenated last.
combined_test.tail(3)

Unnamed: 0,title/text,subreddit
497,So I got my 12 Pro Max yesterday and the MagSa...,1
498,Feature iPhone now allows you to name Bluet...,1
499,Doesn t have to be MagSafe or have wireless ch...,1


In [31]:
# NLTK's English stop-word list, held as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK `stopwords` corpus to be
# downloaded beforehand; no download step is visible in this notebook.
stops = set(stopwords.words('english'))
len(stops)

179

In [32]:
# Extra, corpus-specific stop words to remove on top of the NLTK list.
new_stops = {
    # platform / device names
    'iphone', 'iphones', 'android', 'phone',
    # URL fragments and forum boilerplate
    # ('amp' is presumably what remains of the HTML entity `&amp;`)
    'http', 'amp', 'www', 'com', 'thread', 'reddit',
}

len(new_stops)

10

In [33]:
# Full stop-word set: the NLTK list plus the corpus-specific additions.
updated_stops = stops | new_stops
len(updated_stops)

189

In [34]:
def review_to_words(raw_review, stoplist):
    """Normalise one text into a space-separated string of lemmas.

    Steps, in order:
      1. every character outside a-z / A-Z is replaced by a space
         (digits and punctuation are dropped);
      2. the text is lower-cased and split on whitespace;
      3. tokens contained in ``stoplist`` are discarded;
      4. each remaining token is lemmatized with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    raw_review : str
        The text to clean.
    stoplist : container of str
        Lower-case words to remove; only membership tests are performed.

    Returns
    -------
    str
        The lemmas joined by single spaces (empty string if nothing is left).

    Notes
    -----
    Stop words are removed *before* lemmatization and the lemmas are not
    checked again, so a token whose lemma is a stop word but whose surface
    form is not will remain in the output (presumably e.g. 'https' -> 'http').
    NOTE(review): kept as is so the output stays comparable with the other
    notebooks that use this routine - confirm before changing.
    """
    tokens = re.sub("[^a-zA-Z]", " ", raw_review).lower().split()

    lemmatize = WordNetLemmatizer().lemmatize

    return " ".join(lemmatize(token) for token in tokens if token not in stoplist)

In [35]:
# Look at one raw combined text (row label 1) before cleaning.
combined_test['title/text'][1]

'it s kinda neat TIL a little Gboard trick  Select a word you ve typed and hit the shift key a couple of times and NOW IT S ALL CAPS '

In [36]:
# The same row after cleaning with the extended stop-word set.
review_to_words(combined_test['title/text'][1], updated_stops)

'kinda neat til little gboard trick select word typed hit shift key couple time cap'

In [37]:
# Clean every combined text in row order, then peek at the first three results.
cleaned_text = [
    review_to_words(post, updated_stops)
    for post in combined_test['title/text']
]

cleaned_text[:3]

['samsung buy back guranteed buy back program samsung ran promotion called guaranteed buy back program galaxy series release http samsung u mobile question http slickdeals net sdtid sdop sdpid sdfid lno trd http samsung u mobile pv au u http f fwww samsung fus fmobile fbuy back ffrequently asked question f basically guaranteed half retail price back cash return good condition within month purchased samsung galaxy ultra samsung u released checked samsung website saw return date reduced month month without notice customer say need return december receive credit back also changed whole term half purchase price half retail price return value significantly le promised beginning advise everyone purchased buy back program samsung check order online make sure original invoice return date return value written might need later sure whether legal samsung change term condition purchase made past look pretty unethical original term condition http web archive org web http samsung u mobile buy back t

In [38]:
# Should equal the number of rows in combined_test (one cleaned string per row).
len(cleaned_text)

500

In [39]:
# Attach the cleaned strings as a new column; the list was built by iterating
# over the rows in order, so positions line up.
combined_test['cleaned_text'] = cleaned_text

In [40]:
# Spot-check six random rows. No random_state is passed, so the rows shown
# differ from run to run.
combined_test.sample(6)

Unnamed: 0,title/text,subreddit,cleaned_text
16,Files by Google will soon get a Trash folder ...,0,file google soon get trash folder help restore...
221,Fujitsu arrows nx9 f 52a anounces in Japan,0,fujitsu arrow nx f anounces japan
302,I m kind of stuck between the SE2020 and 12min...,1,kind stuck se mini noticed several post saying...
211,Exclusive OnePlus 9 CAD render reveals trip...,0,exclusive oneplus cad render reveals triple ca...
354,Update at the bottom \n\nHello all Wanted to...,1,update bottom hello wanted write forget event ...
68,Parliament wants to grant EU consumers a rig...,0,parliament want grant eu consumer right repair...


In [41]:
# Save only the cleaned text and the numeric label. `index` is left at its
# default, so the row index is also written as an unnamed first column.
# NOTE(review): the 'datasets' directory must already exist - confirm.
combined_test[['cleaned_text', 'subreddit']].to_csv('datasets/combined_test.csv')