In [297]:
import pandas as pd
import numpy as np
import re

import seaborn as sns

import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# raised after this cell (including ones emitted by libraries) is hidden.
# Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [163]:
from sklearn.preprocessing import OneHotEncoder

## Problem Statement

We're going to try to predict the engagement of TikTok users from a dataset of the top 250 TikTok accounts.

Because our dataset is small, we're going to use cross-validation to identify the best combination of hyperparameters for each model.

## Read in data from GitHub

In [199]:
# Location of the raw CSV in the project's GitHub repository.
tiktok_path = "https://raw.githubusercontent.com/datares/TikTok_Famous/main/Datasets/Top%20Tiktokers%20Data%20Collection/top-250-tiktokers.csv"

# Load the file, discard the columns that are not used in this notebook and
# index the rows by username.
tiktok_raw = (
    pd.read_csv(tiktok_path)
    .drop(['Rank', 'Brand Account', 'Views', 'LGBTQ'], axis=1)
    .set_index('Username')
)

# Work on a copy so the untouched raw frame stays available for later cells.
tiktok = tiktok_raw.copy()

## Data Cleaning

In [241]:
# Split the comma-separated Genre string of every row into a list of labels
# (rows with a missing Genre stay as NaN in this list).
genres = tiktok_raw.Genre.str.split(', ').to_list()

# Per-row set of cleaned genre labels; rows without a usable Genre get an empty
# set. `isinstance` is used instead of an identity check against np.nan so any
# kind of missing value is handled, and blank labels are discarded.
genre_tags = [
    {item.strip() for item in sublist if item.strip()} if isinstance(sublist, list) else set()
    for sublist in genres
]

# Sorted so that the order of the generated columns is the same on every run
# (plain set iteration order is not stable across interpreter sessions).
unique_genres = sorted(set().union(*genre_tags))

# One boolean indicator column per genre. Membership is tested against the
# exact labels of each row rather than with `str.contains`, which would treat
# the genre as a regular expression and also match it as a substring of a
# longer genre name.
for genre in unique_genres:
    tiktok[genre] = pd.Series(
        [genre in tags for tags in genre_tags],
        index=tiktok_raw.index,
    )

# The original free-text column is fully represented by the indicators now.
tiktok = tiktok.drop(columns=['Genre'])

In [242]:
# Pattern for an unsigned decimal number ("12", "12.5", "12." or ".5").
# It is a raw string so that `\d` is not interpreted as a string escape, and
# it cannot match the empty string, so a value without any number becomes NaN
# instead of failing inside float('').
match = r'\d+(?:\.\d*)?|\.\d+'

# Convert the text columns to floats by extracting the first number in each
# value with the vectorized string accessor.
# NOTE(review): anything after the number (for example a unit or magnitude
# suffix) is discarded, so values are only comparable within a column if every
# row uses the same unit - confirm against the raw data.
for column in ['Followers', 'Likes', 'Engagement']:
    tiktok[column] = (
        tiktok_raw[column]
        .str.extract('(' + match + ')', expand=False)
        .astype(float)
    )

In [271]:
# A missing 'Famous' value is replaced by 0; afterwards every row that still
# has a missing value in any column is removed.
tiktok = (
    tiktok
    .assign(Famous=tiktok['Famous'].fillna(0))
    .dropna()
)

In [255]:
# Text-valued columns that are replaced by one-hot indicator columns.
categorical_vars = ['Country', 'Gender', 'Ethnicity']

# Remove the original columns and append their dummy encoding; the dummies are
# computed from the frame as it was before the columns were dropped.
tiktok = (
    tiktok
    .drop(columns=categorical_vars)
    .join(pd.get_dummies(tiktok[categorical_vars]))
)

## Data Exploration

In [202]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
tiktok.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, @charlidamelio to @sarati
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                256 non-null    object 
 1   Followers              256 non-null    float64
 2   Likes                  256 non-null    float64
 3   Engagement             256 non-null    float64
 4   Gender                 232 non-null    object 
 5   Age                    225 non-null    float64
 6   Ethnicity              242 non-null    object 
 7   Famous                 255 non-null    float64
 8   Genre                  254 non-null    object 
 9   Reviews                256 non-null    bool   
 10  Acting                 256 non-null    bool   
 11  Dancing                256 non-null    bool   
 12  Motivational Speaking  256 non-null    bool   
 13  Content House          256 non-null    bool   
 14  Singing                256 non-null    bool   

## Feature, Target Split

In [273]:
# The target is the engagement column, kept as a single-column DataFrame.
y = tiktok.loc[:, ['Engagement']]
# Every other column of the cleaned frame is used as a feature.
X = tiktok.drop('Engagement', axis=1)

## Model Fitting

In [299]:
from sklearn.linear_model import LinearRegression, ElasticNet, Perceptron
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [319]:
def fit_models(X, y, cv=None, random_state=0):
    """Grid-search a preprocessing + regression pipeline for several model types.

    For each candidate estimator class a pipeline of ``PolynomialFeatures`` ->
    ``StandardScaler`` -> estimator is tuned with ``GridSearchCV`` (scored by
    negative mean squared error). The best parameter combination and its
    cross-validated MSE are printed per model.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix handed to ``GridSearchCV.fit``.
    y : array-like, Series or DataFrame
        Target values. A two-dimensional target with exactly one column is
        flattened to one dimension before fitting; anything else is passed
        through unchanged.
    cv : optional
        Forwarded to ``GridSearchCV``'s ``cv`` argument. ``None`` (the default)
        keeps ``GridSearchCV``'s own default splitting, as before.
    random_state : int or None, default 0
        Seed applied to every estimator that exposes a ``random_state``
        parameter, so that repeated runs give the same scores.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    # Grid shared by every model (preprocessing options).
    # NOTE: with degree 1 the `interaction_only` flag does not change the
    # generated features, so those two grid points are equivalent.
    shared_grid = {'poly__degree': [1, 2], 'poly__interaction_only': [True, False]}
    # Estimator class -> model-specific grid.
    model_grids = {LinearRegression: {'model__positive': [True, False]},
                   ElasticNet: {},
                   RandomForestRegressor: {},
                   KNeighborsRegressor: {}}

    # A single-column 2-D target (e.g. a one-column DataFrame) is flattened so
    # estimators that expect a 1-D target do not have to convert it themselves.
    y_values = np.asarray(y)
    if y_values.ndim == 2 and y_values.shape[1] == 1:
        y = y_values.ravel()

    for model, params in model_grids.items():
        estimator = model()
        # Seed stochastic estimators; deterministic ones have no such parameter.
        if 'random_state' in estimator.get_params():
            estimator.set_params(random_state=random_state)

        pipe = Pipeline([('poly', PolynomialFeatures(interaction_only=True)),
                         ('scaler', StandardScaler()),
                         ('model', estimator)])

        # Build a fresh combined grid instead of mutating the per-model dict.
        param_grid = {**params, **shared_grid}
        search = GridSearchCV(pipe, param_grid,
                              scoring='neg_mean_squared_error', cv=cv)
        search.fit(X, y)

        print(model.__name__, ':', search.best_params_)
        # The score is the negated MSE, so flip the sign back for reporting.
        print('\tMSE:', -search.best_score_)

In [320]:
# Tune every candidate model on the full feature/target split and print each
# model's best parameter combination with its cross-validated MSE.
fit_models(X, y)

LinearRegression : {'model__positive': True, 'poly__degree': 1, 'poly__interaction_only': True}
	MSE: 27.417742912290826
ElasticNet : {'poly__degree': 1, 'poly__interaction_only': True}
	MSE: 19.769952707394346
RandomForestRegressor : {'poly__degree': 2, 'poly__interaction_only': True}
	MSE: 20.160365902222228
KNeighborsRegressor : {'poly__degree': 1, 'poly__interaction_only': True}
	MSE: 23.07973511111111


## Metrics/Evaluation