In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Import DataFrames

In [7]:
df_insta = pd.read_csv("../datasets/df_insta.csv", parse_dates=["date_time"])
df_insta.shape

(14488, 22)

In [8]:
df_insta.head(3)

Unnamed: 0,post_id,post_slug,unix_time,date_time,post_caption,hashtags,topic_tags,is_video,is_ad,post_likes,...,owner_id,owner_verified,owner_privacy,owner_unpublished,owner_total_posts,owner_total_followers,at_tags,useful_tags,tags,content
0,1006641971264095607,34T4oZMgV3,1434221095,2015-06-13 18:44:55,oh look natural heart shaped form of ivan heng...,"['#PinkDotSg', '#WhereLovesLiveSg', '#Rediscov...",['2 people.'],False,False,40,...,196778200,False,False,False,5205,1232,0,0,0,oh look natural heart shaped form of ivan heng...
1,1202880298563729182,BCxfUInNPce,1457614527,2016-03-10 12:55:27,#exploresingapore #instasg #gf_singapore #game...,"['#exploresingapore', '#instasg', '#gf_singapo...",['indoor.'],False,False,33,...,33123388,False,False,False,742,995,0,indoor,indoor,#exploresingapore #instasg #gf_singapore #game...
2,677102977071319026,lljZH4sgfy,1394936986,2014-03-16 02:29:46,on the road to hell reliving haw par villa 15 ...,"['#rediscoversg', '#Singapore']",['Photo by Belinda Tan in 虎豹别墅.'],False,False,3,...,196778200,False,False,False,5205,1232,0,0,0,on the road to hell reliving haw par villa 15 ...


In [11]:
df_yt_sel = pd.read_csv("../datasets/df_yt_sel.csv", parse_dates=["date_time"])
df_yt_sel.shape

(248, 10)

In [12]:
df_yt_sel.head(3)

Unnamed: 0,video_title,video_caption,date_time,video_slug,video_views,video_likes,video_dislikes,video_comments,content,channel
0,HALLOWEEN HORROR NIGHTS 8 AT UNIVERSAL STUDIOS! 👻,['UNIVERSAL HALLOWEEN HORROR NIGHTS 8! 👻\n\nHa...,2018-10-31,7xkdm3c4Ks8,8766,271,3,18 Comments,HALLOWEEN HORROR NIGHTS 8 AT UNIVERSAL STUDIOS...,GC
1,Eating the BEST rated PRATA in SINGAPORE! 🇸🇬 *...,['Eating the BEST rated PRATA in SINGAPORE! 🇸🇬...,2019-11-17,b-ppBtSiG38,33483,649,19,159 Comments,Eating the BEST rated PRATA in SINGAPORE! 🇸🇬 *...,GC
2,McDoanlds Hokkaido Salmon Burger In Singapore!,['TRYING MCDONALDS HOKKAIDO SALMON BURGER IN S...,2018-05-13,IDXoNb548ec,13809,430,13,117 Comments,McDoanlds Hokkaido Salmon Burger In Singapore!...,GC


In [14]:
df_yt_comments_sel = pd.read_csv("../datasets/df_yt_comments_sel.csv")
df_yt_comments_sel.shape

(10626, 7)

In [15]:
df_yt_comments_sel.head(3)

Unnamed: 0,response_to,user,timestamp,comment,likes,replies_attracted,channel
0,HALLOWEEN HORROR NIGHTS 8 AT UNIVERSAL STUDIOS! 👻,thattham,1 year ago,"I was a scare actor in killuminati, we hope u ...",4,CLAIM NOW!,GC
1,McDoanlds Hokkaido Salmon Burger In Singapore!,Whyywinter.,2 years ago,Hey friends! 👋🏻 What’s your thoughts on Mcdona...,35,View 10 replies,GC
2,McDoanlds Hokkaido Salmon Burger In Singapore!,CT,2 years ago,Is it even possible to dislike seaweed shaker ...,51,,GC


### Likes Topography

In [None]:
# since we are using likes as our target variable
# it is important to understand the distribution of our target variable
# and if there are any correlation between our target variable and other meta-data
# as well as consider if any sort of qualification is needed when comparing likes across platforms

#### Instagram Likes

In [19]:
df_insta.head(1)

Unnamed: 0,post_id,post_slug,unix_time,date_time,post_caption,hashtags,topic_tags,is_video,is_ad,post_likes,...,owner_id,owner_verified,owner_privacy,owner_unpublished,owner_total_posts,owner_total_followers,at_tags,useful_tags,tags,content
0,1006641971264095607,34T4oZMgV3,1434221095,2015-06-13 18:44:55,oh look natural heart shaped form of ivan heng...,"['#PinkDotSg', '#WhereLovesLiveSg', '#Rediscov...",['2 people.'],False,False,40,...,196778200,False,False,False,5205,1232,0,0,0,oh look natural heart shaped form of ivan heng...


In [20]:
df_insta.columns

Index(['post_id', 'post_slug', 'unix_time', 'date_time', 'post_caption',
       'hashtags', 'topic_tags', 'is_video', 'is_ad', 'post_likes', 'geo_tag',
       'geo_slug', 'owner_id', 'owner_verified', 'owner_privacy',
       'owner_unpublished', 'owner_total_posts', 'owner_total_followers',
       'at_tags', 'useful_tags', 'tags', 'content'],
      dtype='object')

In [18]:
df_insta["post_likes"].describe()

count    14488.000000
mean       122.552112
std        344.185959
min          0.000000
25%         18.000000
50%         40.000000
75%         95.000000
max      11469.000000
Name: post_likes, dtype: float64

In [None]:
# let's create 2 new features to evaluate likes against
### age of posts - from time of scrape to date of post
### no. of hashtags used

In [32]:
# posts were scraped over 5 days
# let's take the last date of scrape on 2020-10-22
scrape = pd.Timestamp("2020-10-22 23:59")

# create new col for age of post
df_insta["post_age"] = scrape - df_insta["date_time"]
df_insta.shape # verify new col added

(14488, 23)

In [47]:
# create new col for no. of hashtags used
df_insta["hash_count"] = [len(h) for h in df_insta["hashtags"].str.extractall("(\#)").groupby(level=0)[0].apply(lambda x: "".join(x))]
df_insta.shape # verify new col added

ValueError: Length of values does not match length of index

In [50]:
(df_insta["hashtags"].str.extractall("(\#)").groupby(level=0)[0].apply(lambda x: "".join(x)))

0

In [51]:
len("abc def g")

9

In [None]:
sns.scatterplot()

In [3]:
df_insta = pd.read_csv("../datasets/df_insta_2020-10-12.csv", parse_dates=["date_time"])
df_insta.head(3)

Unnamed: 0,post_id,post_slug,unix_time,date_time,post_caption,hashtags,topic_tags,is_video,is_ad,post_likes,geo_tag,geo_slug,owner_id,owner_verified,owner_privacy,owner_unpublished,owner_total_posts,owner_total_followers
0,2415039914432249111,CGD8gG6H70X,1602115214,2020-10-08 00:00:14,We may be stressed and tied up during the week...,"['#selfcaresaturday', '#saturday', '#skincare'...","['one or more people', ""text that says 'Charge...",False,False,2,,,3986460102,False,False,False,169,263
1,2403781281091523733,CFb8lbhMZSV,1600773080,2020-09-22 11:11:20,"Wanted mooncakes, but found something else... ...","['#oldman', '#lunchtime', '#streetphotography'...",,False,False,26,Chinatown,chinatown,32884185,False,False,False,512,269
2,2412436218553090968,CF6sfWKJMOY,1601804829,2020-10-04 09:47:09,"Me the local tourist, visited Pulau Ubin, Part...","['#singaporediscovers', '#pulauubin', '#bumboa...",,False,False,15,"Pulau Ubin, Singapore",pulau-ubin-singapore,1477803308,False,False,False,1221,377


In [4]:
df_insta.shape

(3104, 18)

In [5]:
df_insta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3104 entries, 0 to 3103
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   post_id                3104 non-null   int64         
 1   post_slug              3104 non-null   object        
 2   unix_time              3104 non-null   int64         
 3   date_time              3104 non-null   datetime64[ns]
 4   post_caption           3083 non-null   object        
 5   hashtags               3083 non-null   object        
 6   topic_tags             2037 non-null   object        
 7   is_video               3104 non-null   bool          
 8   is_ad                  3104 non-null   bool          
 9   post_likes             3104 non-null   int64         
 10  geo_tag                2383 non-null   object        
 11  geo_slug               2378 non-null   object        
 12  owner_id               3104 non-null   int64         
 13  own

In [24]:
df_insta["hashtags"].isnull().sum()

21

In [15]:
check = []
for hashes in df_insta["hashtags"].str.lower():
    try:
        if "#singaporediscovers" in hashes:
            check.append(True)
    except:
        check.append(False)

In [16]:
pd.Series(check).value_counts()

True     2604
False      21
dtype: int64

In [29]:
df_insta["hashtags"][0]

"['#selfcaresaturday', '#saturday', '#skincare', '#saturdaymood', '#saturdaycare', '#saturdayfun', '#beautysg', '#sgfinds', '#selfcare', '#coslabbeautyspa', '#teamcoslab', '#beautyundercoversg', '#sgpromo', '#sgdeals', '#sgbeauty', '#supportlocalsg', '#SingapoRediscovers', '#shoponline', '#freedelivery', '#coslab1010', '#1010sale', '#savethedate']"

In [56]:
mask = df_insta["hashtags"].str.lower().str.contains("#singaporediscovers|singaporediscover").astype("bool")
mask.value_counts()

True     2625
False     479
Name: hashtags, dtype: int64

In [57]:
df_insta[~mask][["hashtags", "post_caption"]]

Unnamed: 0,hashtags,post_caption
9,[],A Heart Warming Mid-Autumn 2020\n牛车水庆中秋2020\nC...
19,['#misterzimi'],a few times i came to @raffleshotelsingapore f...
22,[],Me: 📸📸📸\n\nThe uncles trying to do their laund...
33,[],"Showers & Rainbow\nTiong Bahru, Singapore\n5.5..."
34,[],Directions\n\nCOVID-19 took the world by its a...
...,...,...
3083,"['#magicboxsg', '#magixers']",We are the only one-stop online gifting platfo...
3088,[],I could get used to spending my weekend aftern...
3089,[],Looks safe. What could possibly go wrong here?...
3092,[],Just wanna chill at the pool and lay there soa...
