In [1]:
import json

import pandas as pd
from datetime import datetime, timedelta

import trane
import featuretools as ft
import numpy as np



In [2]:
# Load the trending-videos table. `trending_date` is stored as text in
# two-digit-year.day.month order, so it is parsed with an explicit format.
df = pd.read_csv("USvideos.csv")
df["trending_date"] = pd.to_datetime(df["trending_date"], format="%y.%d.%m")
df = df.sort_values(by=["trending_date"])

# Encode each channel title as an integer id.
# The titles are sorted before numbering so the title -> id mapping is the
# same on every kernel restart; numbering in `set` iteration order (as before)
# depends on string hash randomisation and changes from run to run.
id_to_channel = sorted(df["channel_title"].unique())  # id -> title
channel_to_id = {title: idx for idx, title in enumerate(id_to_channel)}  # title -> id
n_channel = len(id_to_channel)
df["channel_title"] = df["channel_title"].map(channel_to_id)

# Read the table metadata; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open("meta.json") as meta_file:
    meta = trane.TableMeta(json.load(meta_file))

# NOTE(review): disabled alternative configuration kept from the original
# notebook; it appears to swap the per-channel entity for a single synthetic
# root entity -- confirm against the trane docs before re-enabling.
# entity_col = "__fake_root_entity__"
# df, meta = trane.overall_prediction_helper(df, meta)
entity_col = "channel_title"

df.head()

Unnamed: 0,trending_date,channel_title,category_id,views,likes,dislikes,comment_count
0,2017-11-14,1953,22,748374,57527,2966,15954
127,2017-11-14,885,10,98422,2926,106,798
128,2017-11-14,1872,15,426078,19323,245,945
129,2017-11-14,686,10,33315,1365,24,91
130,2017-11-14,981,10,1231518,32648,6221,9288


In [3]:
# Base and end dates handed to the cutoff strategy.
cutoff_base = datetime(2017, 12, 1)
cutoff_end = datetime(2018, 6, 1)
# NOTE(review): the trailing 28 is presumably a window length in days -- confirm
# against trane.FixWindowCutoffStrategy.
cutoff_strategy = trane.FixWindowCutoffStrategy(
    entity_col, cutoff_base, cutoff_end, 28
)

# `category_id` is numeric in the CSV but is declared categorical for
# featuretools.
variable_types = {"category_id": ft.variable_types.Categorical}
features = trane.FeaturetoolsWrapper(
    df, entity_col, "trending_date", variable_types, "youtube"
)
# NOTE(review): this 28 is passed separately from the one given to the cutoff
# strategy; whether the two must agree is not visible here -- confirm.
features.compute_features(df, cutoff_strategy, 28)

Built 35 features
Elapsed: 00:08 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 12/12 chunks


In [4]:
# Enumerate candidate prediction problems for the chosen entity column, using
# the table metadata loaded earlier. The generator reports a running
# "Success/Attempt" counter while it works (see the recorded output below).
problem_generator = trane.PredictionProblemGenerator(
    entity_col=entity_col,
    time_col="trending_date",
    table_meta=meta,
)
problems = problem_generator.generate()

Success/Attempt = 0/0Success/Attempt = 1/1Success/Attempt = 1/2Success/Attempt = 1/3Success/Attempt = 2/4Success/Attempt = 3/5Success/Attempt = 4/6Success/Attempt = 5/7Success/Attempt = 5/8Success/Attempt = 6/9Success/Attempt = 6/10Success/Attempt = 6/11Success/Attempt = 6/12Success/Attempt = 6/13Success/Attempt = 6/14Success/Attempt = 7/15Success/Attempt = 7/16Success/Attempt = 7/17Success/Attempt = 7/18Success/Attempt = 7/19Success/Attempt = 7/20Success/Attempt = 7/21Success/Attempt = 8/22Success/Attempt = 9/23Success/Attempt = 10/24Success/Attempt = 11/25Success/Attempt = 11/26Success/Attempt = 11/27Success/Attempt = 12/28Success/Attempt = 13/29Success/Attempt = 14/30Success/Attempt = 15/31Success/Attempt = 15/32Success/Attempt = 15/33Success/Attempt = 15/34Success/Attempt = 15/35Success/Attempt = 15/36Success/Attempt = 15/37Success/Attempt = 15/38Success/Attempt = 15/39Success/Attempt = 15/40Success/Attempt = 15/41Success/Attempt = 15/42S

Success/Attempt = 198/775


In [None]:
# Evaluator bound to the full table, the entity column and the cutoff strategy;
# it is reused by every evaluation cell that follows.
evaluator = trane.PredictionProblemEvaluator(
    df,
    cutoff_strategy=cutoff_strategy,
    entity_col=entity_col,
)

In [None]:
# Evaluate every generated problem against the precomputed features, then
# persist the outcome as JSON in a file named after the entity column.
result = trane.multi_process_evaluation(evaluator, problems, features)

result_path = entity_col + "_result.json"
with open(result_path, "w") as result_file:
    json.dump(result, result_file)

  0%|          | 0/198 [00:00<?, ?it/s]

In [None]:
# Evaluate the problems one at a time, printing each problem's position and
# text description first so the output shows which problem is being processed.
for problem_idx, problem in enumerate(problems):
    print(problem_idx, problem)
    evaluator.evaluate(problem, features)

0 For each <channel_title> predict the number of records


In [9]:
# Two toy target labels (0 and 1), used as the targets in the fit calls below.
Y = list(range(2))

In [10]:
from sklearn.svm import LinearSVC

In [15]:
# Stand-alone linear SVM for a quick experiment: primal formulation
# (dual=False) with the solver capped at 100 iterations.
svm = LinearSVC(max_iter=100, dual=False)

In [16]:
# Fit the stand-alone SVM on the feature matrix `X` and the labels `Y`.
# NOTE(review): `X` is not assigned in any visible cell of this notebook, and
# the execution counters jump from In [10] to In [15], so it presumably came
# from a cell that was later removed. Unless `X` is defined elsewhere, this
# raises NameError on a fresh kernel -- restore the cell that builds `X`.
svm.fit(X, Y)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [17]:
# Predict on the same `X` the model was fitted on (a sanity check, not a
# held-out evaluation).
# NOTE(review): `X` is not assigned in any visible cell; this depends on
# leftover kernel state and will not survive Restart & Run All as written.
svm.predict(X)

array([0, 1])

In [21]:
# Fit the model stored under "model" in the entry at index 3 of
# `evaluator.classifier` on the same toy `X` / `Y`; the recorded output shows
# that model is a LinearSVC with max_iter=1000.
# NOTE(review): the index 3 is hard-coded and relies on the ordering of
# `evaluator.classifier` -- confirm it still selects the intended model.
# NOTE(review): `X` is not assigned in any visible cell (execution counters
# skip from In [17] to In [21]); this depends on leftover kernel state.
evaluator.classifier[3]["model"].fit(X, Y)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)