# <font color='SlateBlue'>FUNCTION</font>

## SVM

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import svm

def svm_objective(trial):
  """Optuna objective: mean 10-fold cross-validated F1 score of an SVC.

  Samples ``kernel``, ``C`` and ``gamma`` from ``trial``, builds an
  ``svm.SVC`` from them and scores it on the notebook-level ``train_data`` /
  ``state`` arrays.

  Args:
    trial: Optuna trial used to sample the hyperparameters.

  Returns:
    Mean of the per-fold F1 scores (to be maximized by the study).
  """
  params = {'kernel': trial.suggest_categorical('kernel', ['linear','rbf','sigmoid']),
            # Log-scale search; suggest_float(log=True) replaces the
            # deprecated suggest_loguniform with the same bounds.
            'C': trial.suggest_float('C', 1e+0, 1e+2/2, log=True),
            'gamma': trial.suggest_float('gamma', 1e-3, 3.0, log=True),}

  model = svm.SVC(**params)

  # Score the estimator built from the sampled parameters. Previously a global
  # `clf` was passed here, so the sampled hyperparameters were never evaluated.
  svm_scores = cross_val_score(model, train_data, state, scoring='f1', cv=10)

  return svm_scores.mean()

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def rf_objective(trial):
  """Optuna objective: mean 10-fold cross-validated F1 score of a random forest.

  Samples the forest hyperparameters from ``trial``, builds a
  ``RandomForestClassifier`` from them and scores it on the notebook-level
  ``train_data`` / ``state`` arrays.

  Args:
    trial: Optuna trial used to sample the hyperparameters.

  Returns:
    Mean of the per-fold F1 scores (to be maximized by the study).
  """
  params = {
      # 'mse' / 'mae' are regression criteria; a classifier needs an impurity
      # measure such as 'gini' or 'entropy'.
      'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
      # Real booleans: the strings 'True' / 'False' are both truthy.
      'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
      'max_depth': trial.suggest_int('max_depth', 1, 1000),
      # 'auto' was an alias of 'sqrt' for classifiers and has been removed
      # from recent scikit-learn releases.
      'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
      # scikit-learn requires max_leaf_nodes > 1.
      'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 1000),
      'n_estimators': trial.suggest_int('n_estimators', 1, 1000),
      'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
      'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
  }

  # Fixed seed (same value as the LightGBM objective) so that differences
  # between trials come from the hyperparameters, not from forest randomness.
  model = RandomForestClassifier(**params, random_state=0)

  # Score the estimator built from the sampled parameters. Previously a global
  # `clf` was passed here, so the sampled hyperparameters were never evaluated.
  rfc_scores = cross_val_score(model, train_data, state, scoring='f1', cv=10)

  return rfc_scores.mean()

## LightGBM

In [None]:
import lightgbm as lgb

def lgb_objective(trial):
  """Optuna objective: mean 10-fold cross-validated F1 score of a LightGBM model.

  Keeps the boosting setup fixed and samples only ``num_leaves``,
  ``reg_alpha`` and ``reg_lambda`` from ``trial``; the resulting
  ``LGBMClassifier`` is scored on the notebook-level ``train_data`` /
  ``state`` arrays.

  Args:
    trial: Optuna trial used to sample the hyperparameters.

  Returns:
    Mean of the per-fold F1 scores (to be maximized by the study).
  """
  params = {'boosting_type':'gbdt',
            'max_depth':-1,
            'learning_rate':0.1,
            'n_estimators': 1000,
            # NOTE(review): 'l2' is a regression metric; it presumably has no
            # effect on the F1 score computed below - confirm or drop it.
            'metric':'l2',
            'num_leaves': trial.suggest_int('num_leaves', 10, 300),
            # Log-scale search; suggest_float(log=True) replaces the
            # deprecated suggest_loguniform with the same bounds.
            'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10, log=True)}

  model = lgb.LGBMClassifier(**params,random_state=0)

  # Score the estimator built from the sampled parameters. Previously a global
  # `clf` was passed here, so the sampled hyperparameters were never evaluated.
  lgb_scores = cross_val_score(model, train_data, state, scoring='f1', cv=10)

  return lgb_scores.mean()

# <font color='SlateBlue'>STATIC</font>

In [None]:
# Single place to change when the dataset folder moves.
# NOTE(review): presumably a Google Drive mount inside Colab - confirm.
DATASET_DIR = '/content/drive/MyDrive/SIGNATE/DATASET'

# Competition files, all located directly under DATASET_DIR.
TRAIN_PATH = DATASET_DIR + '/train.csv'
TEST_PATH = DATASET_DIR + '/test.csv'
SAMPLE_SUBMISSION_PATH = DATASET_DIR + '/sample_submit.csv'

# <font color='SlateBlue'>IMPORT DATA</font>

In [None]:
import pandas as pd

# Read both competition splits into DataFrames.
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# Report (rows, columns) of each split as a quick sanity check.
print(f'df_train shape: {df_train.shape}')
print(f'df_test shape: {df_test.shape}')

df_train shape: (10545, 8)
df_test shape: (10544, 7)


In [None]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1


In [None]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content
0,10545,1-1000,US,60,dance,performances,"<div class=""contents""><div><p>We are presentin..."
1,10546,8001-9000,US,30,crafts,printing,"<div class=""contents""><div><a href=""http://dum..."
2,10547,9001-10000,US,60,design,product design,"<div class=""contents""><div><div class=""templat..."
3,10548,1001-2000,US,30,technology,software,"<div class=""contents""><div><p>This is my video..."
4,10549,4001-5000,US,59,technology,software,"<div class=""contents""><div><h1 class=""page-anc..."


# <font color='SlateBlue'>EDA</font>

## 型

In [None]:
# Column dtypes of the training data.
print(df_train.dtypes)

id               int64
goal            object
country         object
duration         int64
category1       object
category2       object
html_content    object
state            int64
dtype: object


In [None]:
# Column dtypes of the test data.
print(df_test.dtypes)

id               int64
goal            object
country         object
duration         int64
category1       object
category2       object
html_content    object
dtype: object


## 欠損値

In [None]:
# Number of missing values per column in the training data.
print(df_train.isnull().sum())

id              0
goal            0
country         0
duration        0
category1       0
category2       0
html_content    0
state           0
dtype: int64


In [None]:
# Number of missing values per column in the test data.
print(df_test.isnull().sum())

id              0
goal            0
country         0
duration        0
category1       0
category2       0
html_content    0
dtype: int64


## 要素

In [None]:
# Number of distinct values per column in the training data.
print(df_train.nunique())

id              10545
goal               99
country            22
duration           73
category1          15
category2         144
html_content    10470
state               2
dtype: int64


In [None]:
# Number of distinct values per column in the test data.
print(df_test.nunique())

id              10544
goal              100
country            22
duration           68
category1          15
category2         145
html_content    10475
dtype: int64


# <font color='SlateBlue'>MODIFY DATAFRAME FOR TRAINING MODEL</font>

## About html_content

In [None]:
# Keep the target aside before removing it from the feature frame. The guard
# makes the cell safe to re-run: on a second run 'state' is already gone from
# df_train and the previously extracted df_train_state is kept as is.
if 'state' in df_train.columns:
  df_train_state = df_train['state']

# Remove the raw HTML text (not used as a feature below) and the target.
# errors='ignore' keeps a re-run from raising KeyError on columns that were
# already dropped.
df_train = df_train.drop(columns=['html_content', 'state'], errors='ignore')
df_test = df_test.drop(columns=['html_content'], errors='ignore')

# Stack both splits row-wise into one frame.
df = pd.concat([df_train, df_test], axis=0)

print('df_train shape: {0}'.format(df_train.shape))
print('df_test shape: {0}'.format(df_test.shape))
print('df shape: {0}'.format(df.shape))

df_train shape: (10545, 6)
df_test shape: (10544, 6)
df shape: (21089, 6)


## One hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

import numpy as np


def _one_hot_encode(column):
  """One-hot encode ``column`` of df_train and df_test with a shared vocabulary.

  The encoder is fitted on the combined frame ``df`` so that a category which
  occurs in only one of the two splits still gets a column in both outputs.
  Prints the shape of each encoded array.

  Args:
    column: Name of the column to encode.

  Returns:
    Tuple ``(train, test)`` of dense 2-D arrays, one row per sample.
  """
  encoder = OneHotEncoder()
  encoder.fit(df[column].values.reshape(-1, 1))
  # Densify explicitly instead of passing `sparse=False`: that keyword was
  # renamed to `sparse_output` and later removed from scikit-learn, whereas
  # .toarray() on the default sparse result works on every version.
  train = encoder.transform(df_train[column].values.reshape(-1, 1)).toarray()
  test = encoder.transform(df_test[column].values.reshape(-1, 1)).toarray()
  print('train_{0} shape: {1}'.format(column, train.shape))
  print('test_{0} shape: {1}'.format(column, test.shape))
  return train, test


train_goal, test_goal = _one_hot_encode('goal')
train_country, test_country = _one_hot_encode('country')
# NOTE(review): duration is numeric but is encoded as a categorical here.
train_duration, test_duration = _one_hot_encode('duration')
train_category1, test_category1 = _one_hot_encode('category1')
train_category2, test_category2 = _one_hot_encode('category2')

# Target as a flat 1-D array, the shape scikit-learn estimators expect for y.
state = df_train_state.values.ravel()
print('state shape: {0}'.format(state.shape))

# Final feature matrices: all encoded columns side by side, same order in both splits.
train_data = np.concatenate([train_goal, train_country, train_duration, train_category1, train_category2], axis=1)
test_data = np.concatenate([test_goal, test_country, test_duration, test_category1, test_category2], axis=1)
print('train_data: {0}'.format(train_data.shape))
print('test_data: {0}'.format(test_data.shape))

train_goal shape: (10545, 101)
test_goal shape: (10544, 101)
train_country shape: (10545, 22)
test_country shape: (10544, 22)
train_duration shape: (10545, 76)
test_duration shape: (10544, 76)
train_category1 shape: (10545, 15)
test_category1 shape: (10544, 15)
train_category2 shape: (10545, 145)
test_category2 shape: (10544, 145)
state shape: (10545,)
train_data: (10545, 359)
test_data: (10544, 359)


# <font color='Slateblue'>PREDICTION</font>

In [None]:
# from sklearn import svm

# clf = svm.SVC()

# clf.fit(train_data, state)

# predicted = clf.predict(test_data)

# df_sub = pd.concat([df_test['id'], pd.DataFrame(predicted, columns=['predicted'])], axis=1)
# print(df_sub.dtypes)
# print('df_sub shape: {0}'.format(df_sub.shape))
# df_sub.head()

In [None]:
# df_sub.to_csv('submission.csv', index=False, header=False)

# <font color='red'>Model Evaluation</font>

# <font color='SlateBlue'>Optuna</font>

In [None]:
# Install Optuna at runtime; the repeated -q flags reduce pip's output.
# NOTE(review): the version is not pinned, so a fresh run may pull a different
# Optuna release than the one the recorded results came from - consider pinning.
!pip -q -q -q install optuna
import optuna

[K     |████████████████████████████████| 286kB 4.2MB/s 
[K     |████████████████████████████████| 163kB 48.8MB/s 
[K     |████████████████████████████████| 81kB 7.3MB/s 
[K     |████████████████████████████████| 81kB 9.3MB/s 
[K     |████████████████████████████████| 133kB 50.9MB/s 
[K     |████████████████████████████████| 112kB 52.0MB/s 
[K     |████████████████████████████████| 51kB 2.4MB/s 
[?25h  Building wheel for PrettyTable (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


## SVM

|SVM|mean|standard deviation|Private LB|
|:--:|:--:|:--:|:--:|
|default|0.735|0.017|0.732|
|Optuna (sample)|0.733|||

|SVM|kernel|C|gamma|||||
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|Optuna (sample)|rbf|7.560|0.002|||||

In [None]:
# svm_study = optuna.create_study(direction='maximize')
# svm_study.optimize(svm_objective, n_trials=100)
# svm_trial = svm_study.best_trial

# print('SVM best f1 score: {0}'.format(svm_trial.value))
# print('SVM Best hyperparameters: {0}'.format(svm_trial.params))

## RandomForest

|RandomForest|mean|standard deviation|
|:--:|:--:|:--:|
|default|0.726|0.014|
|Optuna (sample)|0.733||

|RandomForest|criterion|bootstrap|max_depth|max_features|max_leaf_nodes|n_estimators|min_samples_split|min_samples_leaf|
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|Optuna (sample)|mse|False|988|log2|413|691|4|8|

In [None]:
# rf_study = optuna.create_study(direction='maximize')
# rf_study.optimize(rf_objective, n_trials=100)
# rf_trial = rf_study.best_trial

# print('RandomForest best f1 score: {0}'.format(rf_trial.value))
# print('RandomForest best hyperparameters: {0}'.format(rf_trial.params))

## LightGBM

|LightGBM|mean|standard deviation|
|:--:|:--:|:--:|
|default|0.737|0.017|
|Optuna (sample)|0.734||

|LightGBM|num_leaves|reg_alpha|reg_lambda||||||
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|Optuna (sample)|258|0.008|3.027||||||

In [None]:
# lgb_study = optuna.create_study(direction='maximize')
# lgb_study.optimize(lgb_objective, n_trials=100)
# lgb_trial = lgb_study.best_trial

# print('LightGBM best f1 score: {0}'.format(lgb_trial.value))
# print('LightGBM best hyperparameters: {0}'.format(lgb_trial.params))

# <font color='Slateblue'>PREVENT TIMEOUT</font>

```javascript
// Logs a heartbeat message and clicks the element matched by "#comments > span".
// NOTE(review): presumably meant to be pasted into the browser console of the
// notebook page to simulate user activity - confirm the selector still matches.
function ClickConnect(){
  console.log("Working");
  const target = document.querySelector("#comments > span");
  // querySelector returns null when nothing matches; skip the click in that
  // case instead of throwing a TypeError on every timer tick.
  if (target) {
    target.click();
  }
}
setInterval(ClickConnect,500000)
```