In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import lightgbm as lgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier


In [3]:
def scoreknn(params,X,y):
    """Hyperopt objective: cross-validated loss of a k-nearest-neighbours model.

    Builds ``KNeighborsClassifier(**params)``, evaluates it with 5-fold
    stratified cross-validation and converts the fold scores into a value
    that Hyperopt can minimise.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``KNeighborsClassifier``.
    X, y
        Features and labels, passed straight to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is
        ``1 - (mean - 1.96 * std)`` of the fold scores, i.e. one minus a
        pessimistic lower bound on the score.
    """
    model = KNeighborsClassifier(**params)
    # shuffle=True is what makes random_state meaningful: without it older
    # scikit-learn silently ignores the seed and recent releases raise
    # ValueError ("Setting a random_state has no effect since shuffle is False").
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv)
    # TODO: Add the importance for the selected features
    # Hyperopt minimises, so return 1 - (lower bound of the score).
    loss = 1 - (scores.mean() - (1.96*scores.std()))
    return {'loss': loss, 'status': STATUS_OK}


def optimizeknn(X,y,max_evals=1000,trials=None):
    """Search KNeighborsClassifier hyperparameters with Hyperopt (TPE).

    Parameters
    ----------
    X, y
        Training features and labels; forwarded unchanged to ``scoreknn``.
    max_evals : int, default 1000
        Number of configurations to evaluate (previously hard-coded).
    trials : hyperopt.Trials, optional
        Trials object handed to ``fmin`` so the caller can inspect every
        evaluation afterwards. ``None`` lets ``fmin`` create its own.

    Returns
    -------
    dict
        The raw result of ``fmin``. It is keyed by the hyperopt *labels*
        (so ``'weight'``, not ``'weights'``) and, for every ``hp.choice``
        entry, holds the *index* of the chosen option rather than its value:
        ``n_neighbors`` index ``i`` means ``i + 2``, ``leaf_size`` index ``i``
        means ``i + 25``, ``p`` index ``i`` means ``i + 1`` and ``weight``
        indexes ``['uniform', 'distance']``. Decode before reusing it.
    """
    space ={
        'n_neighbors' : hp.choice('n_neighbors', np.arange(2,150)),
        'weights' : hp.choice('weight',['uniform', 'distance']),
        'leaf_size' : hp.choice('leaf_size', np.arange(25,100)),
        'p' : hp.choice('p', np.arange(1,5)),
        'n_jobs' : -1
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(lambda par: scoreknn(par,X,y), space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best

In [4]:
def scoresvm(params,X,y):
    """Hyperopt objective: cross-validated loss of a support-vector classifier.

    Builds ``SVC(**params)``, evaluates it with 5-fold stratified
    cross-validation, prints the sampled parameters and the resulting score,
    and returns the value Hyperopt should minimise.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``SVC``.
    X, y
        Features and labels, passed straight to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is
        ``1 - (mean - 1.96 * std)`` of the fold scores.
    """
    print("Training with params: ")
    print(params)
    model = SVC(**params)
    # shuffle=True is what makes random_state meaningful: without it older
    # scikit-learn silently ignores the seed and recent releases raise
    # ValueError ("Setting a random_state has no effect since shuffle is False").
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv)
    # TODO: Add the importance for the selected features
    print("Scores: %.4f +/- %.4f" % (scores.mean(),1.96*scores.std()))
    # Hyperopt minimises, so return 1 - (lower bound of the score).
    loss = 1 - (scores.mean() - (1.96*scores.std()))
    return {'loss': loss, 'status': STATUS_OK}


def optimizesvm(X,y,max_evals=100,trials=None):
    """Search SVC hyperparameters with Hyperopt (TPE).

    Parameters
    ----------
    X, y
        Training features and labels; forwarded unchanged to ``scoresvm``.
    max_evals : int, default 100
        Number of configurations to evaluate (previously hard-coded).
    trials : hyperopt.Trials, optional
        Trials object handed to ``fmin`` so the caller can inspect every
        evaluation afterwards. ``None`` lets ``fmin`` create its own.

    Returns
    -------
    dict
        The raw result of ``fmin``. ``C`` and ``gamma`` are the sampled
        values; ``decision_function_shape`` is the *index* into
        ``['ovr', 'ovo']`` (``hp.choice`` results are reported as indices),
        and the fixed ``random_state`` is not included.
    """
    space ={
        # quniform rounds to multiples of the last argument, so C is sampled
        # on a 0.01 grid and gamma on a 0.001 grid.
        'C' : hp.quniform('C', 0.01,10,0.01),
        'gamma' : hp.quniform('gamma',0.001,1,0.001),
        'decision_function_shape' : hp.choice('decision_function_shape', ['ovr','ovo']),
        'random_state' : 3
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(lambda par: scoresvm(par,X,y), space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best

In [5]:
def scorexgb(params,X,y):
    """Hyperopt objective: cross-validated loss of an XGBoost classifier.

    Builds ``XGBClassifier(**params)``, evaluates it with 5-fold stratified
    cross-validation, prints the parameters and the resulting score, and
    returns the value Hyperopt should minimise.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBClassifier``. Must contain
        ``'n_estimators'``; it is cast to ``int`` because ``hp.quniform``
        samples floats. The caller's dict is not modified.
    X, y
        Features and labels, passed straight to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is
        ``1 - (mean - 1.96 * std)`` of the fold scores.
    """
    print("Training with params: ")
    # Work on a copy so the int cast does not leak into the caller's dict.
    params = dict(params)
    params["n_estimators"] = int(params["n_estimators"])
    print(params)
    model = XGBClassifier(**params)
    # shuffle=True is what makes random_state meaningful: without it older
    # scikit-learn silently ignores the seed and recent releases raise
    # ValueError ("Setting a random_state has no effect since shuffle is False").
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv)
    # TODO: Add the importance for the selected features
    print("Scores: %.4f +/- %.4f" % (scores.mean(),1.96*scores.std()))
    # Hyperopt minimises, so return 1 - (lower bound of the score).
    loss = 1 - (scores.mean() - (1.96*scores.std()))
    return {'loss': loss, 'status': STATUS_OK}

def optimizexgb(X,y,max_evals=100,trials=None):
    """Search XGBClassifier hyperparameters with Hyperopt (TPE).

    Parameters
    ----------
    X, y
        Training features and labels; forwarded unchanged to ``scorexgb``.
    max_evals : int, default 100
        Number of configurations to evaluate (previously hard-coded).
    trials : hyperopt.Trials, optional
        Trials object handed to ``fmin`` so the caller can inspect every
        evaluation afterwards. ``None`` lets ``fmin`` create its own.

    Returns
    -------
    dict
        The raw result of ``fmin``: sampled values for the ``hp.quniform``
        entries (as floats, including ``n_estimators``) and, for
        ``max_depth``, the *index* into ``np.arange(1, 14)`` - i.e. the real
        depth is ``index + 1``. Fixed entries of the space are not included.
    """
    # To learn more about XGBoost parameters, head to this page:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    space ={
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # A problem with max_depth casted to float instead of int with
        # the hp.quniform method.
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'merror',
        'objective': 'multi:softmax',
        # XGBoost's parameter for the number of classes is `num_class`;
        # the previous key `n_class` is not a parameter XGBoost knows.
        'num_class' : 3,
        # Increase this number if you have more cores. Otherwise, remove it and it will default
        # to the maximum number.
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        # NOTE(review): newer XGBoost releases use `verbosity` instead of
        # `silent` - confirm against the installed version.
        'silent': 1,
        'seed': 12
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(lambda par: scorexgb(par,X,y), space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best

In [6]:
def score(params,X,y):
    """Hyperopt objective: cross-validated loss of a LightGBM classifier.

    Builds ``lgb.LGBMClassifier(**params)``, evaluates it with 5-fold
    stratified cross-validation, prints the parameters and the resulting
    score, and returns the value Hyperopt should minimise.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LGBMClassifier``. Must contain
        ``'n_estimators'``, ``'max_bin'``, ``'num_leaves'`` and
        ``'bagging_freq'``; these are cast to ``int`` because
        ``hp.quniform`` samples floats. The caller's dict is not modified.
    X, y
        Features and labels, passed straight to ``cross_val_score``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is
        ``1 - (mean - 1.96 * std)`` of the fold scores.
    """
    print("Training with params: ")
    # Work on a copy so the int casts do not leak into the caller's dict.
    params = dict(params)
    for int_key in ("n_estimators", "max_bin", "num_leaves", "bagging_freq"):
        params[int_key] = int(params[int_key])
    print(params)
    model = lgb.LGBMClassifier(**params)
    # shuffle=True is what makes random_state meaningful: without it older
    # scikit-learn silently ignores the seed and recent releases raise
    # ValueError ("Setting a random_state has no effect since shuffle is False").
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv)
    # TODO: Add the importance for the selected features
    print("Scores: %.4f +/- %.4f" % (scores.mean(),1.96*scores.std()))
    # Hyperopt minimises, so return 1 - (lower bound of the score).
    loss = 1 - (scores.mean() - (1.96*scores.std()))
    return {'loss': loss, 'status': STATUS_OK}




def optimize(X,y,max_evals=100,trials=None):
    """Search LGBMClassifier hyperparameters with Hyperopt (TPE).

    Parameters
    ----------
    X, y
        Training features and labels; forwarded unchanged to ``score``.
    max_evals : int, default 100
        Number of configurations to evaluate (previously hard-coded).
    trials : hyperopt.Trials, optional
        Trials object handed to ``fmin`` so the caller can inspect every
        evaluation afterwards. ``None`` lets ``fmin`` create its own.

    Returns
    -------
    dict
        The raw result of ``fmin``. ``hp.quniform`` entries come back as
        floats (e.g. ``n_estimators`` 140.0) and ``hp.choice`` entries come
        back as *indices*: ``boosting_type`` indexes ``['gbdt', 'dart']`` and
        ``max_depth`` indexes ``np.arange(1, 14)``, so the real depth is
        ``index + 1``. Decode before passing the result to a model.
    """
    # NOTE(review): bagging_fraction/subsample and
    # feature_fraction/colsample_bytree look like LightGBM aliases of the same
    # setting - confirm which one takes effect when both are supplied.
    space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
    'max_bin' : hp.quniform('max_bin', 255,750,1),
    'num_leaves' : hp.quniform('num_leaves', 31, 75,1),
    'bagging_fraction' : hp.quniform('bagging_fraction', 0.2,0.9,0.025),
    'bagging_freq' : hp.quniform('bagging_freq', 2,100,1),
    'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1.0, 0.025),
    'lambda_l1' : hp.quniform('lambda_l1', 0,5,0.5),
    'lambda_l2' : hp.quniform('lambda_l2', 0,5,0.5),
    'boosting_type': hp.choice('boosting_type', ['gbdt','dart']),
    'learning_rate' : hp.quniform('learning_rate', 0.025, 0.5, 0.025),
    'max_depth' :  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05)
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(lambda par: score(par,X,y), space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best

In [8]:
# Load the training table from the working directory and preview its first rows.
data_train = pd.read_csv('train_final4.csv')
data_train.head()

Unnamed: 0,id,fac_1,fac_2,fac_3,fac_4,fac_5,fac_6,fac_7,fac_8,poi_1,...,diff_fac_1,diff_fac_2,diff_fac_3,diff_fac_4,diff_fac_5,diff_fac_6,diff_fac_7,diff_fac_8,diff_poi,diff_poi_bin
0,3057,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,9928,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1752,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,10219,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
2,1776,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10303,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2404,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9912,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0
4,2263,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,10471,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
# Names of the columns used as model inputs; the same list selects the
# feature matrix from both the training and the test table further down.
feat_use = ['fac_3',
 'fac_4',
 'fac_5',
 'fac_6',
 'fac_8',
 'poi_1',
 'poi_2',
 'poi_3',
 'price_monthly',
 'price*call',
 'price_bin',
 'size_bin',
 'room_bin',
 'fac_1_campur',
 'fac_3_campur',
 'fac_3_putra',
 'fac_3_putri',
 'fac_4_campur',
 'fac_4_putra',
 'fac_4_putri',
 'fac_5_putra',
 'fac_8_campur',
 'fac_8_putri']

In [5]:
# Remove the `id` column from the training table. Reassigning the result
# (instead of inplace=True) together with errors='ignore' makes the cell
# idempotent: re-running it no longer raises KeyError once `id` is gone.
data_train = data_train.drop(columns='id', errors='ignore')

In [10]:
# Feature matrix (columns listed in feat_use) and target labels (`gender`)
# as plain NumPy arrays.
X = data_train[feat_use].values
y = data_train['gender'].values

In [12]:
# scaler = MinMaxScaler()
# pc = PCA(n_components=0.99,svd_solver='full')

In [11]:
# Stratified hold-out split (scikit-learn's default test size). The fixed
# random_state makes the split - and every score derived from it -
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [14]:
# X_train = pc.fit_transform(X_train)

In [15]:
# X_train


In [16]:
# Run the SVC hyperparameter search on the training split.
# (The recorded run below was stopped with a KeyboardInterrupt during the first evaluation.)
optimizesvm(X_train,y_train)

Training with params: 
{'C': 3.36, 'decision_function_shape': 'ovr', 'gamma': 0.171, 'random_state': 3}
  0%|                                                                            | 0/100 [00:00<?, ?it/s, best loss: ?]


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-b9ed30805d5e>", line 1, in <module>
    optimizesvm(X_train,y_train)
  File "<ipython-input-3-a52a8536f945>", line 32, in optimizesvm
    max_evals=100)
  File "C:\ProgramData\Anaconda3\lib\site-packages\hyperopt\fmin.py", line 407, in fmin
    rval.exhaust()
  File "C:\ProgramData\Anaconda3\lib\site-packages\hyperopt\fmin.py", line 262, in exhaust
    self.run(self.max_evals - n_done, block_until_done=self.asynchronous)
  File "C:\ProgramData\Anaconda3\lib\site-packages\hyperopt\fmin.py", line 227, in run
    self.serial_evaluate()
  File "C:\ProgramData\Anaconda3\lib\site-packages\hyperopt\fmin.py", line 141, in serial_evaluate
    result = self.domain.evaluate(spec, ctrl)
  File "C:\ProgramData\Anaconda3\lib\site-packages\hyperopt\base.py", line 844, in evaluate


KeyboardInterrupt: 

In [17]:
# Run the LightGBM hyperparameter search on the training split.
# In the displayed result, hp.choice entries (boosting_type, max_depth) are
# option indices, not the actual parameter values.
optimize(X_train,y_train)

Training with params: 
{'bagging_fraction': 0.275, 'bagging_freq': 19, 'boosting_type': 'gbdt', 'colsample_bytree': 0.6000000000000001, 'feature_fraction': 0.8500000000000001, 'lambda_l1': 3.5, 'lambda_l2': 4.5, 'learning_rate': 0.42500000000000004, 'max_bin': 602, 'max_depth': 9, 'min_child_weight': 2.0, 'n_estimators': 269, 'num_leaves': 48, 'subsample': 0.5}
Scores: 0.6006 +/- 0.0117
Training with params: 
{'bagging_fraction': 0.8500000000000001, 'bagging_freq': 13, 'boosting_type': 'dart', 'colsample_bytree': 0.8500000000000001, 'feature_fraction': 0.925, 'lambda_l1': 0.5, 'lambda_l2': 1.0, 'learning_rate': 0.25, 'max_bin': 265, 'max_depth': 13, 'min_child_weight': 3.0, 'n_estimators': 140, 'num_leaves': 61, 'subsample': 0.8}
Scores: 0.6403 +/- 0.0181
Training with params: 
{'bagging_fraction': 0.2, 'bagging_freq': 70, 'boosting_type': 'gbdt', 'colsample_bytree': 0.8500000000000001, 'feature_fraction': 0.625, 'lambda_l1': 0.5, 'lambda_l2': 5.0, 'learning_rate': 0.2, 'max_bin': 533,

{'bagging_fraction': 0.8500000000000001,
 'bagging_freq': 13.0,
 'boosting_type': 1,
 'colsample_bytree': 0.8500000000000001,
 'feature_fraction': 0.925,
 'lambda_l1': 0.5,
 'lambda_l2': 1.0,
 'learning_rate': 0.25,
 'max_bin': 265.0,
 'max_depth': 12,
 'min_child_weight': 3.0,
 'n_estimators': 140.0,
 'num_leaves': 61.0,
 'subsample': 0.8}

In [18]:
# Scratch arithmetic: 1 - loss, i.e. the (mean - 1.96*std) lower bound of the
# CV score. NOTE(review): 0.378... is presumably the best loss reported by the
# search above - confirm.
1- 0.378266484204294

0.621733515795706

In [12]:
# Hand-written LightGBM configuration used for the final model.
# NOTE(review): these values differ from the `best` dict displayed above;
# presumably they were transcribed from another search run - confirm.
para = {'bagging_fraction': 0.7000000000000001,
 'bagging_freq': 57,
 'boosting_type': 'dart',
 'colsample_bytree': 0.9500000000000001,
 'feature_fraction': 0.8,
 'lambda_l1': 4.0,
 'lambda_l2': 2.5,
 'learning_rate': 0.125,
 'max_bin': 561,
 'max_depth': 6,
 'min_child_weight': 3.0,
 'n_estimators': 609,
 'num_leaves': 62,
 'subsample': 0.55}
# Unfitted classifier built from that configuration.
model = lgb.LGBMClassifier(**para)

In [13]:
# 5-fold stratified cross-validation of the hand-picked LightGBM model.
# shuffle=True is what makes random_state meaningful: without it older
# scikit-learn ignores the seed and recent releases raise ValueError.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# NOTE: this rebinds the name `score`, which until now referred to the
# LightGBM objective function used by optimize(); re-run the cell defining
# that function before calling optimize() again.
score = cross_val_score(model,X_train,y_train,cv=cv)

In [14]:
# Report the mean fold score with a +/- 1.96*std band.
print("Score: %.5f +/- %.5f" % (score.mean(), 1.96*score.std()))

Score: 0.62940 +/- 0.01452


In [15]:
# Fit the final model on the whole training split.
model.fit(X_train,y_train)

LGBMClassifier(bagging_fraction=0.7000000000000001, bagging_freq=57,
        boosting_type='dart', class_weight=None,
        colsample_bytree=0.9500000000000001, feature_fraction=0.8,
        importance_type='split', lambda_l1=4.0, lambda_l2=2.5,
        learning_rate=0.125, max_bin=561, max_depth=6,
        min_child_samples=20, min_child_weight=3.0, min_split_gain=0.0,
        n_estimators=609, n_jobs=-1, num_leaves=62, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=0.55, subsample_for_bin=200000, subsample_freq=0)

In [16]:
# Score on the data the model was fitted on (optimistic by construction).
model.score(X_train,y_train)

0.83151073309032

In [17]:
# Score on the held-out split; the recorded value (~0.64) is well below the
# training score (~0.83) shown above.
model.score(X_test,y_test)

0.6419902912621359

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
# Confusion matrix on the held-out split: rows are true classes, columns are
# predicted classes.
y_hat = model.predict(X_test)
confusion_matrix(y_test,y_hat)

array([[ 79,  24,  49],
       [ 17,  92, 130],
       [ 28,  47, 358]], dtype=int64)

In [50]:
# Confusion matrix entered as a literal.
# NOTE(review): presumably pasted from an earlier run for comparison - confirm.
np.array([[ 84,  14,  54],
       [ 21,  90, 128],
       [ 25,  74, 334]], dtype=np.int64)

array([[ 84,  14,  54],
       [ 21,  90, 128],
       [ 25,  74, 334]], dtype=int64)

In [51]:
# Second confusion matrix entered as a literal.
# NOTE(review): presumably pasted from another earlier run - confirm.
np.array([[ 95,  18,  39],
       [ 14, 106, 119],
       [ 34,  74, 325]], dtype=np.int64)

array([[ 95,  18,  39],
       [ 14, 106, 119],
       [ 34,  74, 325]], dtype=int64)

In [29]:
# Load the test table to predict on and show its (rows, columns) shape.
data_test = pd.read_csv('test_bagi_1.csv')
data_test.shape

(824, 234)

In [30]:
# Preview the first rows of the test table.
data_test.head()

Unnamed: 0,id,fac_1_campur,fac_3_campur,fac_3_putra,fac_3_putri,fac_4_campur,fac_4_putra,fac_4_putri,fac_5_putra,fac_8_campur,...,poi_3 / poi_1,fac_8_putri / fac_4_putri,poi_2 / fac_1_campur,fac_4_putri / fac_8_campur,price*call / fac_4_putri,fac_4_campur / fac_4_putra,fac_4_putra / fac_4_putri,fac_1_campur / poi_1,poi_2 / fac_3_campur,fac_8_putri / fac_4_putra
0,3294,0.13832,0.192504,0.325951,0.481545,0.18957,0.367865,0.442565,0.388793,0.199881,...,0.761959,1.078026,64010.94273,2.214143,3163376.0,0.515326,0.83121,5.3e-05,45993.7876,1.296936
1,3295,0.318396,0.177943,0.245602,0.576455,0.181868,0.226923,0.591209,0.235831,0.199881,...,1.117522,0.806986,11030.2815,2.957804,60892190.0,0.801453,0.383829,4.8e-05,19736.63883,2.102461
2,3296,0.13832,0.177943,0.245602,0.576455,0.181868,0.226923,0.591209,0.235831,0.199881,...,1.059023,0.806986,6383.743216,2.957804,5920074.0,0.801453,0.383829,1.3e-05,4962.258567,2.102461
3,3297,0.13832,0.177943,0.245602,0.576455,0.181868,0.226923,0.591209,0.235831,0.199881,...,1.058912,0.806986,6333.135965,2.957804,7104089.0,0.801453,0.383829,1.3e-05,4922.920164,2.102461
4,3298,0.13832,0.192504,0.325951,0.481545,0.18957,0.367865,0.442565,0.235831,0.199881,...,2.422058,1.078026,73770.91254,2.214143,61685830.0,0.515326,0.83121,8.5e-05,53006.61945,1.296936


In [275]:
# Remove the `id` column from the test table. Reassigning the result
# (instead of inplace=True) together with errors='ignore' makes the cell
# idempotent: re-running it no longer raises KeyError once `id` is gone.
data_test = data_test.drop(columns="id", errors="ignore")

In [31]:
# Test feature matrix, selected with the same feat_use list as the training data.
# NOTE(review): the recorded shape below is (824, 112) although feat_use as
# written in this notebook has 23 entries - the output comes from a different
# kernel state; re-run top to bottom to confirm the model and X_pred agree.
X_pred = data_test[feat_use].values

In [32]:
# Sanity check: (number of test rows, number of features).
X_pred.shape

(824, 112)

In [33]:
# Predict a class label for every test row with the fitted model.
y_pred = model.predict(X_pred)

In [34]:
# Display the predicted labels.
# NOTE(review): this dumps the whole array; a slice such as y_pred[:10] would
# keep the output readable.
y_pred

array(['putri', 'putri', 'putri', 'putri', 'putra', 'putri', 'putri',
       'putri', 'campur', 'putra', 'putra', 'putri', 'putri', 'campur',
       'putri', 'putra', 'putri', 'putra', 'putri', 'putri', 'putri',
       'putri', 'putri', 'putri', 'putri', 'putri', 'putri', 'putri',
       'putri', 'putra', 'putri', 'putri', 'putra', 'putra', 'putra',
       'campur', 'putra', 'putra', 'putri', 'putri', 'putra', 'putra',
       'putri', 'campur', 'putri', 'putri', 'putri', 'campur', 'putri',
       'putri', 'putri', 'putra', 'putra', 'putri', 'campur', 'campur',
       'putra', 'campur', 'campur', 'campur', 'putri', 'putra', 'putra',
       'putri', 'putri', 'campur', 'putri', 'putri', 'putri', 'campur',
       'putri', 'campur', 'campur', 'campur', 'campur', 'campur',
       'campur', 'putra', 'campur', 'putri', 'putri', 'putri', 'putri',
       'putri', 'campur', 'putri', 'putri', 'putri', 'putri', 'putri',
       'putra', 'putri', 'putri', 'putri', 'putri', 'putri', 'putri',
       'p

In [35]:
# Load the semicolon-delimited sample submission to reuse its id column/layout.
# NOTE(review): absolute, machine-specific path - the notebook will not run on
# another machine without editing it.
sample_subs = pd.read_csv('D:/Lomba/JOINTS 2019/Soal/Sample_submission.csv',delimiter=';')
sample_subs.head()

Unnamed: 0,id,gender
0,3294,putri
1,3295,putri
2,3296,putri
3,3297,putri
4,3298,putra


In [36]:
# Overwrite the placeholder labels with the model's predictions (positional:
# assumes sample_subs rows are in the same order as data_test - TODO confirm).
sample_subs['gender'] = y_pred

In [40]:
# Write the raw-prediction submission; index=False keeps the row index out of the file.
sample_subs.to_csv('truly_truly_lgbm_last_hope.csv', index=False)

In [38]:
# Preview the submission frame after filling in the predictions.
sample_subs.head()

Unnamed: 0,id,gender
0,3294,putri
1,3295,putri
2,3296,putri
3,3297,putri
4,3298,putra


In [39]:
# Number of predictions per class.
sample_subs.groupby('gender').count()

Unnamed: 0_level_0,id
gender,Unnamed: 1_level_1
campur,141
putra,182
putri,501


In [39]:
# Index the submission by listing id so rows can be addressed with .loc[id].
# Not idempotent: running this a second time raises KeyError because `id`
# is no longer a column.
sample_subs = sample_subs.set_index('id')

In [44]:
# Manual override of the label for listing 3294. Naming the column explicitly
# avoids overwriting every column of that row with the string if the frame
# ever carries more than `gender`.
sample_subs.loc[3294, 'gender'] = 'putri'

In [46]:
# Load the table of id pairs (columns `ini` and `itu`) used below to copy a
# label from one listing to another.
trans = pd.read_csv('kosan sama test.txt')

In [51]:
# Number of id pairs.
len(trans)

91

In [52]:
# Preview the id pairs.
trans.head()

Unnamed: 0,ini,itu
0,3638,3639
1,3544,3545
2,3831,3832
3,3444,3445
4,3790,3908


In [54]:
# Copy the label of each `ini` listing onto its paired `itu` listing.
# Iterating over the rows of `trans` (instead of a hard-coded range(91)) keeps
# the loop correct if the pair file changes length. Pairs are still applied in
# file order, so a listing that is both a target and a later source resolves
# exactly as before. Only `gender` is written, so no other column is clobbered.
for src_id, dst_id in zip(trans['ini'], trans['itu']):
    sample_subs.loc[dst_id, 'gender'] = sample_subs.loc[src_id, 'gender']

In [57]:
# Temporarily expose the index as an `id` column so the groupby/count below
# has a column to count; it is dropped again before the file is written.
sample_subs['id'] = sample_subs.index

In [60]:
# Number of predictions per class after the pair-based label copy.
sample_subs.groupby('gender').count()

Unnamed: 0_level_0,id
gender,Unnamed: 1_level_1
campur,141
putra,196
putri,487


In [61]:
# Drop the helper `id` column again; the ids remain available as the index.
# In-place and not idempotent: a second run raises KeyError.
sample_subs.drop('id',axis=1,inplace=True)

In [62]:
# Write the final submission. index=False is not passed, so the index (the
# listing ids) is written as the first column - presumably under the header
# `id`; verify the header in the output file.
sample_subs.to_csv('truly_last_hope.csv')