# Environment setting and package import

In [1]:
import json
import numpy as np
import pandas as pd

from datetime import datetime
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
sns.set_style('whitegrid')
palette = sns.color_palette('coolwarm')

# Tiktok posts dataset preprocessing

## Posts extraction

Posts are extracted using TikTok hashtag analysis toolset developped by bellingcat

https://github.com/bellingcat/tiktok-hashtag-analysis?tab=readme-ov-file#tiktok-hashtag-analysis-toolset

In [None]:
# pip install tiktok-hashtag-analysis
# python -m playwright install
# terminal command: tiktok-hashtag-analysis personalcoloranalysis

In [3]:
with open('posts.json', 'r') as file:
    data = json.load(file)

In [55]:
# number of posts
len(data)

474

In [5]:
# preview
data[1]

{'BAInfo': '',
 'adAuthorization': False,
 'adLabelVersion': 0,
 'aigcLabelType': 0,
 'author': {'avatarLarger': 'https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/7310710960996483118~c5_1080x1080.jpeg?lk3s=a5d48078&x-expires=1713178800&x-signature=omCo2%2BVJVIRzPFySiaQ96FX5nMc%3D',
  'avatarMedium': 'https://p77-sign-va.tiktokcdn.com/tos-maliva-avt-0068/7310710960996483118~c5_720x720.jpeg?lk3s=a5d48078&x-expires=1713178800&x-signature=mz4du%2BPqkniQn%2FRWQhL7yyeCKlA%3D',
  'avatarThumb': 'https://p77-sign-va.tiktokcdn.com/tos-maliva-avt-0068/7310710960996483118~c5_100x100.jpeg?lk3s=a5d48078&x-expires=1713178800&x-signature=yLbkMhWH1FGuTsoWj7aHiUTDnT4%3D',
  'commentSetting': 0,
  'downloadSetting': 0,
  'duetSetting': 0,
  'ftc': False,
  'id': '6734083000290722821',
  'isADVirtual': False,
  'isEmbedBanned': False,
  'nickname': 'marissa ren',
  'openFavorite': False,
  'privateAccount': False,
  'relation': 0,
  'secUid': 'MS4wLjABAAAAS7wgMAQ1Q91-SqvywvB65aOP9j56-8BMme7pN_qn3a2R

## Data cleaning

In [7]:
# define function to extract all hashtags in one post
def extract_hashtags(record):
    hashtags = []
    for content in record['contents']:
        for text_extra in content['textExtra']:
            hashtags.append(text_extra['hashtagName'])
    return hashtags

In [8]:
# define function to extract location of a post because most of the posts do not have 'poi'
def extract_location(record):
    location = []
    if 'poi' in record:
        poi = record['poi']
        name = poi.get('name')
        if name:
            location.append(name)
    return location

In [9]:
# create dataframe that stores necessary attributes

post_list = []

for record in data:
    id = record['id']
    user_name = record['author']['nickname']
    author = record['authorStats']
    author_follower = author['followerCount']
    author_heart = author['heart']
    desc = record['desc']
    challenge = record['challenges']
    cha_desc = challenge[0]['desc']
    stats = record['statsV2']
    comments = stats['commentCount']
    digg = stats['diggCount']
    view = stats['playCount']
    share = stats['shareCount']
    collect = stats['collectCount']
    posttime = record['createTime']
    all_hashtags = extract_hashtags(record)
    location = extract_location(record)

    post_list.append(
        {'id':id,
         'user_name':user_name,
        'author_follower':author_follower,
        'author_heart':author_heart,
        'posttime':posttime,
        'location':location,
        'view':view,
        'collect':collect,
        'share':share,
        'digg':digg,
        'comments':comments,
        'desc':desc,
        'cha_desc':cha_desc,
        'all_hashtags':all_hashtags}
    )

df = pd.DataFrame(post_list)


In [10]:
# preview of posts dataframe

df.head(20)

Unnamed: 0,id,user_name,author_follower,author_heart,posttime,location,view,collect,share,digg,comments,desc,cha_desc,all_hashtags
0,7324359490361412869,Carol Brailey|Color Analysis🎨,323700,6800000,1705335339,[],17444557,177617,11690,2207386,8115,This client’s colour analysis update raises so...,,"[carolbrailey, colouranalysis, coloranalysis, ..."
1,7309534985084521759,marissa ren,2400000,87600000,1701883746,[Seoul],15775359,130404,9207,1905913,3221,very fun but humbling experience lol #personal...,,"[personalcoloranalysis, coloranalysis, korea]"
2,7190844699408698667,Rosa,25900,2100000,1674249012,[],9174808,174663,54000,1483705,3330,me the entire video🤨 #fyp #korea #personalcolo...,,"[fyp, korea, personalcoloranalysis, southkorea..."
3,7130674894198623534,khadija 🍉,4076,1900000,1660239629,[],12565693,36630,1075,1469326,1008,When I realized I bought two blue tops the day...,,"[personalcolor, colortest, personalcoloranalys..."
4,7164035101205040430,shu ☁️,9355,1700000,1668006917,[],13440706,42784,3235,1202012,1387,I'm still in denial 🥲 #personalcoloranalysis #...,,"[personalcoloranalysis, personalcolor, persona..."
5,7228231085077908754,Molita Lin,64800,2900000,1682953700,[],9304393,135619,12573,1014673,1721,Finally i tried the personal color test & body...,,"[personalcolor, personalcoloranalysis, bodytyp..."
6,7064232817164602630,Julia Dobkine|Agile Styling,202800,5100000,1644769874,[],7584594,15672,824,881272,945,#personalcolor #personalcoloranalysis #colourc...,,"[personalcolor, personalcoloranalysis, colourc..."
7,7351867077415865646,Fashion Finds and Fits,22500,2000000,1711739975,[],18499755,276554,32432,742171,36275,Use the app WhatColors to find all of this out...,Try our new #ColorAnalysis effect and share yo...,"[coloranalysis, personalcolor, personalcoloran..."
8,7329912922581257504,★,11900,1900000,1706628352,[],6197404,185526,3816,712768,14447,#fyp #foryou #colorpalette #fypシ #undertone #w...,,"[fyp, foryou, colorpalette, fypシ, undertone, w..."
9,7130656073572306222,nvemaa,5784,1700000,1660235247,[],6887743,10044,314,683663,975,ignore my rbf #personalcolor #personalcolorana...,,"[personalcolor, personalcoloranalysis, persona..."


In [11]:
# check data types

df.dtypes

id                 object
user_name          object
author_follower     int64
author_heart        int64
posttime            int64
location           object
view               object
collect            object
share              object
digg               object
comments           object
desc               object
cha_desc           object
all_hashtags       object
dtype: object


In [12]:
# transform datetime

df['post_date']=df['posttime'].apply(lambda x: datetime.fromtimestamp(x))
df.pop('posttime')

0      1705335339
1      1701883746
2      1674249012
3      1660239629
4      1668006917
          ...    
469    1667536565
470    1710332080
471    1712548430
472    1675692635
473    1706026523
Name: posttime, Length: 474, dtype: int64

In [57]:
# transform objects

int_list = ['id','view','share','comments','digg','collect']
str_list = ['user_name','desc','cha_desc'] 

for attr in int_list:
    df[attr] = df[attr].astype(int)

for re in str_list:
    df[re] = df[re].astype('string')

df.dtypes

id                          int64
user_name                  string
author_follower             int64
author_heart                int64
location                   object
view                        int64
collect                     int64
share                       int64
digg                        int64
comments                    int64
desc                       string
cha_desc                   string
all_hashtags               object
post_date          datetime64[ns]
dtype: object

In [58]:
# sorted_df = df.sort_values(by='view', ascending=False)
# sorted_df

In [59]:
# sorted_date_df = df.sort_values(by='post_date')
# sorted_date_df[sorted_date_df['cha_desc'].str.contains('new')]

In [66]:
df.to_csv('posts_df.csv')

# EDA

In [29]:
# count posts per year

plot_df_count = df.groupby(df['post_date'].dt.year).agg(
    post_count=('post_date', 'size'),
    view_count=('view', 'sum'),
    collect_count=('collect','sum'),
    comments_count=('comments', 'sum'),
    share_count=('share', 'sum'),
    digg_count=('digg', 'sum')
).reset_index()

plot_df_count.rename(columns={'post_date': 'year'}, inplace=True)

In [70]:
plot_df_count

Unnamed: 0,year,post_count,view_count,collect_count,comments_count,share_count,digg_count
0,2020,2,1228445,3854,596,1158,123455
1,2021,4,2279523,5181,892,2185,152666
2,2022,94,106680878,438285,49875,40700,9225472
3,2023,289,233222758,2518254,60047,290392,17410302
4,2024,85,97042277,1233486,95358,94920,6545666


In [88]:
df['year'] = df['post_date'].dt.year
df['month'] = df['post_date'].dt.month
df['y_m'] = df['post_date'].dt.strftime('%Y-%m')
df.head(5)

Unnamed: 0,id,user_name,author_follower,author_heart,location,view,collect,share,digg,comments,desc,cha_desc,all_hashtags,post_date,year,month,year_month,y_m
0,7324359490361412869,Carol Brailey|Color Analysis🎨,323700,6800000,[],17444557,177617,11690,2207386,8115,This client’s colour analysis update raises so...,,"[carolbrailey, colouranalysis, coloranalysis, ...",2024-01-15 17:15:39,2024,1,0 2024\n1 2023\n2 2023\n3 ...,2024-01
1,7309534985084521759,marissa ren,2400000,87600000,[Seoul],15775359,130404,9207,1905913,3221,very fun but humbling experience lol #personal...,,"[personalcoloranalysis, coloranalysis, korea]",2023-12-06 18:29:06,2023,12,0 2024\n1 2023\n2 2023\n3 ...,2023-12
2,7190844699408698667,Rosa,25900,2100000,[],9174808,174663,54000,1483705,3330,me the entire video🤨 #fyp #korea #personalcolo...,,"[fyp, korea, personalcoloranalysis, southkorea...",2023-01-20 22:10:12,2023,1,0 2024\n1 2023\n2 2023\n3 ...,2023-01
3,7130674894198623534,khadija 🍉,4076,1900000,[],12565693,36630,1075,1469326,1008,When I realized I bought two blue tops the day...,,"[personalcolor, colortest, personalcoloranalys...",2022-08-11 19:40:29,2022,8,0 2024\n1 2023\n2 2023\n3 ...,2022-08
4,7164035101205040430,shu ☁️,9355,1700000,[],13440706,42784,3235,1202012,1387,I'm still in denial 🥲 #personalcoloranalysis #...,,"[personalcoloranalysis, personalcolor, persona...",2022-11-09 16:15:17,2022,11,0 2024\n1 2023\n2 2023\n3 ...,2022-11


In [90]:
# count posts per month

plot_df_count_2 = df.groupby('y_m').agg(
    post_count=('id', 'size'),
    view_count=('view', 'sum'),
    collect_count=('collect', 'sum'),
    comments_count=('comments', 'sum'),
    share_count=('share', 'sum'),
    digg_count=('digg', 'sum')
).reset_index()

plot_df_count_2

Unnamed: 0,y_m,post_count,view_count,collect_count,comments_count,share_count,digg_count
0,2020-10,1,1017748,3686,499,1101,107863
1,2020-12,1,210697,168,97,57,15592
2,2021-04,1,294783,2440,137,684,42215
3,2021-06,1,1167933,1666,463,299,75014
4,2021-08,2,816807,1075,292,1202,35437
5,2022-01,2,1712240,6053,229,331,104584
6,2022-02,7,10823206,23839,1647,2300,1055469
7,2022-03,4,3568003,2330,261,302,181223
8,2022-04,7,8610464,86399,3830,11529,826298
9,2022-05,11,11952526,36687,20797,1793,1022883


In [60]:
# co-ocurring hashtags summary

allhashtags = []

for hashtags_list in df['all_hashtags']:
    allhashtags.extend(hashtags_list)

hashtags_counter = Counter(allhashtags)
hashtags_dict = dict(hashtags_counter)

hashtags_freq = pd.DataFrame(list(hashtags_dict.items()), columns=['Hashtag', 'Frequency'])
hashtags_freq = hashtags_freq.sort_values(by='Frequency', ascending=False)
hashtags_freq.head(20)

Unnamed: 0,Hashtag,Frequency
9,personalcoloranalysis,479
22,personalcolor,268
2,coloranalysis,165
21,,130
30,personalcolortest,81
4,coloranalysistok,68
5,undertone,64
16,fyp,61
26,kbeauty,60
1,colouranalysis,58
