In [None]:
import pandas as pd
import numpy as np
import re as re
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [None]:
# Load the raw train/test tables; the 'id' column is discarded right away.
X_train = pd.read_csv('producthunt_train.csv').drop(columns=['id'])
X_test = pd.read_csv('producthunt_test.csv').drop(columns=['id'])

# Separate the regression target from the training features.
y_train = X_train['votesCount']
X_train = X_train.drop(columns=['votesCount'])

### Data Preprocessing

#### Handle slug and name

In [None]:
def search_words(text):
    """Keep only the digit-free words of ``text``, joined by single spaces.

    A "word" is a run of word characters that contains no digit and is
    delimited by word boundaries, so a token such as ``abc123`` is dropped
    entirely. Non-string input is not parsed; its ``str()`` form is returned.
    """
    if isinstance(text, str):
        words = re.findall(r'\b[^\d\W]+\b', text)
        return " ".join(words)
    return str(text)

In [None]:
# Normalize both text columns the same way on train and test: keep only the
# digit-free words, glue them together without spaces, then lowercase.
for X in (X_train, X_test):
    for column in ('slug', 'name'):
        X[column] = (
            X[column]
            .apply(search_words)
            .str.replace(' ', '')
            .str.lower()
        )

Check whether the normalized slug is identical to the normalized name

In [None]:
# Boolean feature: True where the normalized slug and name are identical.
for X in (X_train, X_test):
    X['slug_is_name'] = X['slug'].eq(X['name'])

In [None]:
# Character counts of the normalized slug and name.
for X in (X_train, X_test):
    X['slug_len'] = X['slug'].map(len)
    X['name_len'] = X['name'].map(len)

#### Handling topics

Convert the `topics` column from comma-separated strings into a set of topics per row

In [None]:
def _parse_topics(value):
    """Return the set of topic names stored in one `topics` cell.

    * A string is split on ', ' and empty fragments are discarded.
    * An existing set is copied unchanged, so re-running this cell does not
      destroy topics that were already parsed.
    * Anything else (e.g. NaN for a missing value) yields an empty set.
      Calling set(str(value)) here would instead produce the individual
      characters of the text "nan" as bogus topics.
    """
    if isinstance(value, str):
        return {topic for topic in value.split(', ') if topic}
    if isinstance(value, (set, frozenset)):
        return set(value)
    return set()


for X in [X_train, X_test]:
    X['topics'] = X['topics'].apply(_parse_topics)

Getting set of all topics

In [None]:
# Union of every topic that occurs in either the train or the test frame.
all_topics = set()
for X in (X_train, X_test):
    for topics in X['topics']:
        all_topics.update(topics)

One-hot encode the topics and count the topics per row

In [None]:
# One-hot encode the topic sets and record how many topics each row has.
#
# The indicator matrix is filled positionally in a single pass over the rows
# and then written one whole column at a time. This replaces iterrows() plus
# one scalar .at write per (row, topic) pair, which is slow and, because .at
# is label-based, would touch several rows at once on a non-unique index.
topic_columns = list(all_topics)
topic_position = {topic: pos for pos, topic in enumerate(topic_columns)}

for X in [X_train, X_test]:
    row_topics = list(X['topics'])

    indicators = np.zeros((len(row_topics), len(topic_columns)), dtype=bool)
    for row_pos, topics in enumerate(row_topics):
        for topic in topics:
            indicators[row_pos, topic_position[topic]] = True

    # Same column order as before: every topic indicator, then the count.
    for topic, pos in topic_position.items():
        X[topic] = indicators[:, pos]

    X['n_topics'] = np.array([len(topics) for topics in row_topics], dtype='int64')

Drop topics

In [None]:
# The raw topic sets are no longer needed once they are one-hot encoded.
for X in (X_train, X_test):
    del X['topics']

#### Handling date time

In [None]:
# Merge the separate date and time text columns into one parsed timestamp
# column; the two source columns are removed from the frame in the process.
for X in (X_train, X_test):
    stamp = X.pop('dateAdded') + ' ' + X.pop('timeAdded')
    X['datetime'] = pd.to_datetime(stamp, format='%Y-%m-%d %H:%M:%S')

In [None]:
def _datetime_part(part):
    """Build a callable that extracts ``.dt.<part>`` from a frame's 'datetime' column."""
    def extract(frame):
        return getattr(frame['datetime'].dt, part)
    return extract


# Keyword arguments for DataFrame.assign: new column name -> extractor.
# Note that the 'time' column holds the hour of the day.
assign_dict_time = {
    'year': _datetime_part('year'),
    'month': _datetime_part('month'),
    'day_of_week': _datetime_part('day_of_week'),
    'time': _datetime_part('hour'),
}

In [None]:
# Add the calendar columns to both frames. assign() returns new frames, so
# both names are rebound here.
X_train, X_test = (
    frame.assign(**assign_dict_time) for frame in (X_train, X_test)
)

Get seasons

In [None]:
# Derive the (northern-hemisphere, meteorological) season from the month.
#
# dt.month is 1-based (January == 1, December == 12). The previous ranges
# 2-4 / 5-7 / 8-10 were shifted one month early, as if months were counted
# from 0, which labelled February as spring and August as fall.
for X in [X_train, X_test]:
    month_number = X['datetime'].dt.month

    X['season'] = 'winter'  # December, January, February (and missing dates)
    X.loc[(3 <= month_number) & (month_number <= 5), 'season'] = 'spring'
    X.loc[(6 <= month_number) & (month_number <= 8), 'season'] = 'summer'
    X.loc[(9 <= month_number) & (month_number <= 11), 'season'] = 'fall'

Get weekend

In [None]:
# Weekend flag: day_of_week counts from Monday == 0, so values above 4 are
# Saturday and Sunday.
for X in (X_train, X_test):
    X['weekend'] = X['datetime'].dt.day_of_week > 4

Get daypart

In [None]:
# Bucket the hour of day into four parts:
#   night   00:00-04:59
#   morning 05:00-11:59
#   day     12:00-16:59
#   evening 17:00-23:59 (also the default for rows without a timestamp)
#
# The 'day' bucket used to be tested as 12 <= hour < 5, which can never be
# true, so every hour from 12 to 23 was labelled 'evening'. The upper bound
# is now 17 (5 pm).
for X in [X_train, X_test]:
    hour = X['datetime'].dt.hour

    X['daypart'] = 'evening'
    X.loc[(0 <= hour) & (hour < 5), 'daypart'] = 'night'
    X.loc[(5 <= hour) & (hour < 12), 'daypart'] = 'morning'
    X.loc[(12 <= hour) & (hour < 17), 'daypart'] = 'day'

In [None]:
# All calendar features are derived; remove the raw timestamp column.
for X in (X_train, X_test):
    del X['datetime']

#### Final touches

In [None]:
# Free-text columns are not fed to the model; remove them from both frames.
raw_text_columns = ['slug', 'name', 'tagline']
for X in (X_train, X_test):
    X.drop(columns=raw_text_columns, inplace=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Ordinal encoder with default settings; it is fitted and applied in the
# cells that follow.
enc = OrdinalEncoder()

In [None]:
# Fit on train and test stacked together, so the encoder sees the values of
# both splits. Every column of the frames is passed to the encoder.
enc.fit(pd.concat([X_train, X_test]))

In [None]:
# Encode the training features. X_train is rebound to the transform output,
# so re-running this cell would feed already-encoded data back into the encoder.
X_train = enc.transform(X_train)

In [None]:
# Encode the test features with the same fitted encoder. X_test is rebound to
# the transform output, so this cell is not safe to run twice.
X_test = enc.transform(X_test)

### Train model

#### CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor
from ipywidgets import interact  
import ipywidgets as widgets
import optuna

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of a CatBoost regressor.

    The trial samples ``learning_rate`` (log scale, 0.01-1), ``depth`` (3-8),
    ``subsample`` (0.5-1) and ``min_data_in_leaf`` (1-20); the remaining
    settings are fixed, with only 100 boosting iterations per candidate.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean R^2 over the 5 folds, evaluated on the module-level training
        data ``X_train`` / ``y_train``.
    """
    params = {
        "iterations": 100,
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 1, log=True),
        "depth": trial.suggest_int("depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20),
        "loss_function": "RMSE",
        "task_type": "CPU",
        "thread_count": 16,
        "verbose": 0,
        "bootstrap_type": "MVS"
    }
    model = CatBoostRegressor(**params)
    # Score on the training split. The previous call passed `X, y`: `y` is
    # never defined, and `X` is only a loop variable left over from the
    # preprocessing cells (the un-encoded test frame).
    fold_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=5)
    return fold_scores.mean()

# Search 20 hyper-parameter candidates, maximizing the cross-validated R^2.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print('Best R2:', study.best_value)

# Settings for the final model: the best sampled hyper-parameters, trained
# for 1000 iterations with progress logging switched on.
params = {**study.best_params, 'iterations': 1000, 'verbose': 1}

In [None]:
# Train the final regressor on the full training set with the tuned settings
# held in `params`.
model = CatBoostRegressor(**params)
model.fit(X_train, y_train)

In [None]:
# Predict vote counts, cast them to integers and floor negatives at zero.
# NOTE(review): astype(int) truncates toward zero rather than rounding to the
# nearest integer - confirm that this is intended.
y_pred = model.predict(X_test).astype(int).clip(min=0)

In [None]:
# The 'id' column was dropped from X_test during loading, so it is read again
# from the CSV to index the submission file.
test_ids = pd.read_csv('producthunt_test.csv')['id']
submission = pd.DataFrame(data=y_pred, index=test_ids, columns=['votesCount'])
submission.to_csv('submission.csv')