In [67]:
import pandas as pd
import numpy as np
import re

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Problem Statement

We're going to try to predict an attribute of TikTok users from a dataset of the top 250 TikTok accounts.

## Read in data from GitHub

In [69]:
# Location of the top-250 TikTokers CSV in the datares/TikTok_Famous GitHub repo.
# The URL is split across two adjacent literals only to keep the line short.
tiktok_path = (
    "https://raw.githubusercontent.com/datares/TikTok_Famous/main/Datasets/"
    "Top%20Tiktokers%20Data%20Collection/top-250-tiktokers.csv"
)

# Download and parse the CSV into the untouched "raw" frame.
tiktok_raw = pd.read_csv(filepath_or_buffer=tiktok_path)

In [70]:
# Work on a copy so the raw frame stays unmodified and the cleaning cell
# can be re-run from a known starting point.
tiktok = tiktok_raw.copy()

# Preview the raw data. This must be the cell's last expression to render;
# a bare name in the middle of a cell is evaluated and silently discarded.
tiktok_raw.head()

## Data Cleaning

In [71]:
# Pattern for the number at the start of a text value: optional digits,
# an optional decimal point, optional digits. Written as a raw string because
# "\d" inside a plain literal is an invalid escape sequence that newer Python
# versions warn about.
match = r'\d*[.]?\d*'
_leading_number_re = re.compile(match)

# Columns of tiktok_raw that are parsed from text into floats below.
text_number_columns = ['Followers', 'Views', 'Likes', 'Engagement']


def leading_number(text):
    """Return the number at the very start of `text` as a float.

    Everything after the number is ignored.
    NOTE(review): a trailing unit suffix, if any, is therefore discarded --
    confirm every row of a column uses the same unit.

    Raises ValueError when no number can be read at the start of `text`
    (the pattern then matches an empty or incomplete string that float()
    rejects).
    """
    # The pattern can match the empty string, so a match at position 0 always
    # exists; this is the same text re.findall(match, text)[0] would return.
    return float(_leading_number_re.match(text).group())


# One shared pattern and one loop instead of four copy-pasted conversions.
for column in text_number_columns:
    tiktok[column] = tiktok_raw[column].apply(leading_number)

## Data Exploration

In [60]:
# Summarise the cleaned frame: dtype and non-null count for each column.
# (`data_raw` is not defined anywhere in this notebook; `tiktok` is the frame
# whose Followers/Views/Likes/Engagement columns were converted to floats.)
tiktok.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           256 non-null    int64  
 1   Username       256 non-null    object 
 2   Country        256 non-null    object 
 3   Followers      256 non-null    float64
 4   Views          256 non-null    float64
 5   Likes          256 non-null    float64
 6   Engagement     256 non-null    float64
 7   Brand Account  256 non-null    int64  
 8   Gender         232 non-null    object 
 9   Age            225 non-null    float64
 10  Ethnicity      242 non-null    object 
 11  Famous         255 non-null    float64
 12  Genre          254 non-null    object 
 13  LGBTQ          239 non-null    float64
dtypes: float64(7), int64(2), object(5)
memory usage: 28.1+ KB


## Model Fitting

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

In [None]:
# def print_metrics(model, X_train, X_test, y_train, y_test):
# #      training and testing accuracy, F1 score, and AUC
#     y_train_hat = model.predict(X_train)
#     y_test_hat = model.predict(X_test)
    
#     train_accuracy = metrics.accuracy_score(y_train, y_train_hat)
#     test_accuracy = metrics.accuracy_score(y_test, y_test_hat)
#     f1_score = metrics.f1_score(y_test, y_test_hat)
#     auc_score = metrics.roc_auc_score(y_test, y_test_hat)
    
#     print("\ttrain accuracy: ", train_accuracy)
#     print("\ttest accuracy:", test_accuracy)
#     print("\tf1 score:", str(f1_score))
#     print("\tAUC", auc_score)

def fit_models(data):
    """Fit every candidate classifier inside a preprocessing pipeline.

    Each model is wrapped in a Pipeline of OneHotEncoder -> StandardScaler ->
    model and fitted through GridSearchCV over the settings listed in
    `models`. One summary line is printed per model.

    Parameters
    ----------
    data
        Currently unused.
        NOTE(review): training reads the notebook-level `X_train` / `y_train`
        instead of `data` -- confirm whether the split should be derived from
        `data`.

    Returns
    -------
    dict
        Maps each model class name (e.g. "LogisticRegression") to the object
        returned by `GridSearchCV.fit` for that model.
    """
    # Model class -> hyperparameters. A value may be a single setting or a
    # list/tuple of candidate settings to search over.
    models = {LogisticRegression: {},
              RandomForestClassifier: {'min_samples_leaf': 3},
              KNeighborsClassifier: {}}

    classifiers = {}

    for model, parameters in models.items():
        pipe = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore')),
                         # with_mean=False keeps the scaler usable on the
                         # sparse matrix the encoder produces by default.
                         ('scaler', StandardScaler(with_mean=False)),
                         ('model', model())])

        # GridSearchCV needs "<step>__<param>" keys to reach into a Pipeline,
        # and every value must be a sequence of candidates, so wrap scalars.
        param_grid = {
            f'model__{key}': list(value) if isinstance(value, (list, tuple)) else [value]
            for key, value in parameters.items()
        }

        cv = GridSearchCV(pipe, param_grid)
        name = model.__name__
        classifiers[name] = cv.fit(X_train, y_train)
        # Report the settings from `parameters`; `model` is the estimator
        # class and has no .items().
        print(name, "(" + ", ".join(f'{k}: {v}' for k, v in parameters.items()) + ")")
        # TODO: re-enable once print_metrics is restored:
        # print_metrics(classifiers[name], X_train, X_test, y_train, y_test)
    return classifiers

## Metrics/Evaluation