# TikTok (Python)

In [6]:
# Libraries
import numpy as np
import pandas as pd
import re
from skimpy import clean_columns

Course 2 tasks:

- Build a dataframe for the TikTok dataset
- Read in data from TikTok csv file
- Display rows within dataframe
- Examine data type of each column
- Gather descriptive statistics
- Visualize the TikTok data in Python
- Report to TikTok's data team through an executive summary

In [2]:
# Load the raw TikTok export: row 0 of the CSV supplies the column names
# and no leading rows are skipped.
df = pd.read_csv(r"data/tiktok_dataset.csv", header=0, skiprows=None)

In [3]:
# Sanity-check the loaded frame against the expected (rows, columns) shape,
# comparing one dimension at a time
expected_shape = (19382, 12)
[actual == expected for actual, expected in zip(df.shape, expected_shape)]

[True, True]

In [4]:
# Preview the first 10 rows to get a feel for the columns and their values
df.head(n=10)

Unnamed: 0,#,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
0,1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
1,2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
2,3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
3,4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
4,5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0
5,6,claim,8972200955,35,someone shared with me that gross domestic pro...,not verified,under review,336647.0,175546.0,62303.0,4293.0,1857.0
6,7,claim,4958886992,16,someone shared with me that elvis presley has ...,not verified,active,750345.0,486192.0,193911.0,8616.0,5446.0
7,8,claim,2270982263,41,someone shared with me that the best selling s...,not verified,active,547532.0,1072.0,50.0,22.0,11.0
8,9,claim,5235769692,50,someone shared with me that about half of the ...,not verified,active,24819.0,10160.0,1050.0,53.0,27.0
9,10,claim,4660861094,45,someone shared with me that it would take a 50...,verified,active,931587.0,171051.0,67739.0,4104.0,2540.0


In [None]:
# Examine the structure: column names, non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   #                         19382 non-null  int64  
 1   claim_status              19084 non-null  object 
 2   video_id                  19382 non-null  int64  
 3   video_duration_sec        19382 non-null  int64  
 4   video_transcription_text  19084 non-null  object 
 5   verified_status           19382 non-null  object 
 6   author_ban_status         19382 non-null  object 
 7   video_view_count          19084 non-null  float64
 8   video_like_count          19084 non-null  float64
 9   video_share_count         19084 non-null  float64
 10  video_download_count      19084 non-null  float64
 11  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB


In [8]:
# Clean the column names with skimpy's clean_columns, then list them to
# confirm the result.
# NOTE(review): the names displayed below are the same as in the df.info()
# listing above, so this step appears to be a no-op for this dataset.
df = clean_columns(df)
df.columns

Index(['#', 'claim_status', 'video_id', 'video_duration_sec',
       'video_transcription_text', 'verified_status', 'author_ban_status',
       'video_view_count', 'video_like_count', 'video_share_count',
       'video_download_count', 'video_comment_count'],
      dtype='object')

In [12]:
# Collect the engagement count columns (every column whose name contains "_count")
dtype_cols = [c for c in df.columns if "_count" in c]
# Integer cast left disabled: these columns still contain NaNs (see the NaN
# inspection that follows), which a plain integer dtype cannot hold. As
# written, the result would also be discarded rather than assigned back.
# df[dtype_cols].astype("int32")

In [15]:
# Count the missing values in each engagement count column
[df[col_name].isna().sum() for col_name in dtype_cols]

[np.int64(298), np.int64(298), np.int64(298), np.int64(298), np.int64(298)]

In [13]:
# Inspect the summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns
df.describe()

Unnamed: 0,#,video_id,video_duration_sec,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
count,19382.0,19382.0,19382.0,19084.0,19084.0,19084.0,19084.0,19084.0
mean,9691.5,5627454000.0,32.421732,254708.558688,84304.63603,16735.248323,1049.429627,349.312146
std,5595.245794,2536440000.0,16.229967,322893.280814,133420.546814,32036.17435,2004.299894,799.638865
min,1.0,1234959000.0,5.0,20.0,0.0,0.0,0.0,0.0
25%,4846.25,3430417000.0,18.0,4942.5,810.75,115.0,7.0,1.0
50%,9691.5,5618664000.0,32.0,9954.5,3403.5,717.0,46.0,9.0
75%,14536.75,7843960000.0,47.0,504327.0,125020.0,18222.0,1156.25,292.0
max,19382.0,9999873000.0,60.0,999817.0,657830.0,256130.0,14994.0,9599.0


In [16]:
# What are the different values for claim status and how many of each are in the data?
# value_counts() leaves out missing values by default, so rows with a null
# claim_status are not counted here.
df["claim_status"].value_counts()

claim_status
claim      9608
opinion    9476
Name: count, dtype: int64

In [18]:
# Mean view count for each claim status
(
    df
    .groupby("claim_status")["video_view_count"]
    .mean()
)

claim_status
claim      501029.452748
opinion      4956.432250
Name: video_view_count, dtype: float64

In [19]:
# Mean view count for videos labelled "opinion" only: take the per-status
# means and pick out the "opinion" entry by label
(
    df
    .groupby("claim_status")["video_view_count"]
    .mean()
    .loc["opinion"]
)

np.float64(4956.43224989447)

In [22]:
# Number of videos in every (claim status, author ban status) combination
(
    df
    .groupby(["claim_status", "author_ban_status"])["video_id"]
    .count()
)

claim_status  author_ban_status
claim         active               6566
              banned               1439
              under review         1603
opinion       active               8817
              banned                196
              under review          463
Name: video_id, dtype: int64

In [23]:
# Median video share count for each author ban status
(
    df
    .groupby("author_ban_status")["video_share_count"]
    .median()
)

author_ban_status
active            437.0
banned          14468.0
under review     9444.0
Name: video_share_count, dtype: float64

In [None]:
# Count, mean, and median of views, likes, and shares for each author ban status
(
    df
    .groupby("author_ban_status")[
        ["video_view_count", "video_like_count", "video_share_count"]
    ]
    .agg(["count", "mean", "median"])
)

Unnamed: 0_level_0,video_view_count,video_view_count,video_view_count,video_like_count,video_like_count,video_like_count,video_share_count,video_share_count,video_share_count
Unnamed: 0_level_1,count,mean,median,count,mean,median,count,mean,median
author_ban_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
active,15383,215927.039524,8616.0,15383,71036.533836,2222.0,15383,14111.466164,437.0
banned,1635,445845.439144,448201.0,1635,153017.236697,105573.0,1635,29998.942508,14468.0
under review,2066,392204.836399,365245.5,2066,128718.050339,71204.5,2066,25774.696999,9444.0


In [None]:
# Create likes/comments/shares per-view ratio columns.
# Each new ratio column is mapped explicitly to the count column it is derived
# from, rather than deriving the source name from the ratio name with a regex:
# the pairing is visible at a glance and a misnamed column fails with a clear
# KeyError instead of an AttributeError on a failed regex match.
ratio_sources = {
    "likes_per_view": "video_like_count",
    "comments_per_view": "video_comment_count",
    "shares_per_view": "video_share_count",
}
ratio_cols = list(ratio_sources)

for col, source_col in ratio_sources.items():
    # Rows with missing counts stay NaN in the ratio. The describe() output
    # above shows a minimum view count of 20, so there is no division by zero.
    df[col] = df[source_col] / df["video_view_count"]

df[ratio_cols].head(n=15)

Unnamed: 0,likes_per_view,comments_per_view,shares_per_view
0,0.056584,0.0,0.000702
1,0.549096,0.004855,0.135111
2,0.108282,0.000365,0.003168
3,0.548459,0.001335,0.079569
4,0.62291,0.002706,0.073175
5,0.521454,0.005516,0.185069
6,0.647958,0.007258,0.258429
7,0.001958,2e-05,9.1e-05
8,0.409364,0.001088,0.042306
9,0.183612,0.002727,0.072714


In [34]:
# Count, mean, and median of the per-view ratio columns for every
# (claim status, author ban status) combination
(
    df
    .groupby(["claim_status", "author_ban_status"])[ratio_cols]
    .agg(["count", "mean", "median"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,likes_per_view,likes_per_view,likes_per_view,comments_per_view,comments_per_view,comments_per_view,shares_per_view,shares_per_view,shares_per_view
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,median,count,mean,median,count,mean,median
claim_status,author_ban_status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
claim,active,6566,0.329542,0.326538,6566,0.001393,0.000776,6566,0.065456,0.049279
claim,banned,1439,0.345071,0.358909,1439,0.001377,0.000746,1439,0.067893,0.051606
claim,under review,1603,0.327997,0.320867,1603,0.001367,0.000789,1603,0.065733,0.049967
opinion,active,8817,0.219744,0.21833,8817,0.000517,0.000252,8817,0.043729,0.032405
opinion,banned,196,0.206868,0.198483,196,0.000434,0.000193,196,0.040531,0.030728
opinion,under review,463,0.226394,0.228051,463,0.000536,0.000293,463,0.044472,0.035027
