In [1]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import math
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(suppress=True)

In [3]:
base_path = '/home/jovyan/work/Molecular_Properties'

In [4]:
structures = pd.read_csv(f'{base_path}/structures.csv')
mulliken = pd.read_csv(f'{base_path}/mulliken_charges.csv')

In [5]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.013,1.086,0.008
1,dsgdb9nsd_000001,1,H,0.002,-0.006,0.002
2,dsgdb9nsd_000001,2,H,1.012,1.464,0.0
3,dsgdb9nsd_000001,3,H,-0.541,1.448,-0.877
4,dsgdb9nsd_000001,4,H,-0.524,1.438,0.906


In [6]:
structures = structures.merge(mulliken, how = 'left', on = ['molecule_name', 'atom_index'])

In [7]:
structures_train = structures[pd.notna(structures['mulliken_charge'])]

In [8]:
for f in ['atom']:
    lbl = LabelEncoder()
    lbl.fit(list(structures_train[f].values) + list(structures_train[f].values))
    structures_train[f] = lbl.transform(list(structures_train[f].values))
    #df_test[f] = lbl.transform(list(df_test[f].values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [9]:
X = structures_train.drop(['molecule_name', 'mulliken_charge'], axis=1)
Y = structures_train['mulliken_charge']

#X_test = df_test.drop(['id', 'molecule_name'], axis=1)

In [10]:
Y.describe()

count   1533537.000
mean         -0.000
std           0.225
min          -0.733
25%          -0.192
50%           0.099
75%           0.127
max           0.729
Name: mulliken_charge, dtype: float64

In [11]:
# Set up folds
K = 10
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

In [12]:
### Train the Model
params = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'n_estimators': 1000,
          'max_depth': 9,
          'learning_rate': 0.1,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

In [13]:
scores = []
for i, (train_index, test_index) in enumerate(kf.split(structures_train)):

        # Create data for this fold
        Y_train, Y_valid = Y.iloc[train_index].copy(), Y.iloc[test_index].copy()
        X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
        
        print( f'\nFold: {i}')

        fit_model = lgb.LGBMRegressor(**params)
        fit_model.fit(X_train, Y_train)
        pred = fit_model.predict(X_valid)
        #submit_pred += fit_model.predict(X_test) / K
        # Save validation predictions for this fold
        print( "MAE: ", mean_absolute_error(Y_valid, pred))
        scores.append(mean_absolute_error(Y_valid, pred))
        
print(f'The mean score of a model for is: {np.mean(scores)}')


Fold: 0


KeyboardInterrupt: 

In [None]:
def train_model(df_train, df_test, model_type = None, params = None, gridParams = None):
        print(f'{model_type} will be trained ...')
        dict_score = dict()
        submission_list = list()
        for moltype in df_train['type'].unique():
            varImp = pd.read_csv(f'{base_path}/varImp/{model_type}/variable_importances_{moltype}.csv')
            varImp = varImp[varImp.Value >= 100]
            varImpCount = varImp["Feature"].nunique() 
            varImpUnique = list(varImp['Feature'].unique())
            varImpUnique.extend(['id', 'molecule_name', 'scalar_coupling_constant', 'type'])

            df_train_type = df_train[df_train.type == moltype].reset_index(drop = True)
            df_train_type = df_train_type[varImpUnique]

            varImpUnique.remove('scalar_coupling_constant')

            df_test_type = df_test[df_test.type == moltype].reset_index(drop = True)
            df_test_type = df_test_type[varImpUnique]

            print(f'{varImpCount} features have been chosen for modeling of the {moltype} type')

            df_train_type = df_train_type.drop(['type'],  axis=1)
            df_test_type = df_test_type.drop(['type'],  axis=1)
            X = df_train_type.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1)
            Y = df_train_type['scalar_coupling_constant']
            X_test = df_test_type.drop(['id', 'molecule_name'], axis=1)
            print(moltype)
#             print(f'Hyper parameter optimization for {moltype} starts...')

#             mdl = lgb.LGBMRegressor(**params)

#             grid = GridSearchCV(estimator = mdl, 
#                                 param_grid = gridParams,
#                                 scoring='neg_mean_absolute_error',
#                                 verbose=50,
#                                 cv=2,
#                                 n_jobs=30)

#             grid.fit(X, Y)

#             maxdep = grid.best_params_["max_depth"]
#             learnrate = grid.best_params_["learning_rate"]
            
#             # Print the best parameters found
#             print(f' the best max_depth is {maxdep} and the best learning rate of {learnrate}')
#             print(f'the best score is {grid.best_score_}')

#             params['max_depth'] = grid.best_params_['max_depth']
#             params['learning_rate'] = grid.best_params_['learning_rate']

#             print(f'Hyper parameter optimization for {moltype} is finished. Model tarining starts')

#             gc.collect()      
            submit_pred = np.zeros(X_test.shape[0])
            scores = []
            for i, (train_index, test_index) in enumerate(kf.split(df_train_type)):

                # Create data for this fold
                Y_train, Y_valid = Y.iloc[train_index].copy(), Y.iloc[test_index].copy()
                X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()

                print( f'\nFold {moltype}: {i}')
            
                if model_type == 'LGBM':
                    fit_model = lgb.LGBMRegressor(**params)
                    fit_model.fit(X_train, Y_train)
                
                elif model_type == 'CatBoost':
                    fit_model = cat.CatBoostRegressor(**params)                                                    
                    fit_model.fit(X_train, Y_train,  verbose = False)
                else : 
                    print(f'This {model_type} is not yet supported!')
                
                pred = fit_model.predict(X_valid)
                submit_pred += fit_model.predict(X_test) / K
                # Save validation predictions for this fold
                print( "Group Log MAE: ", math.log(mean_absolute_error(Y_valid, pred)))
                scores.append(math.log(mean_absolute_error(Y_valid, pred)))
                

            print(f'The mean score of a model for {moltype} is: {np.mean(scores)}')
            dict_score[moltype] = np.mean(scores)

            #submit_pred = fit_model.predict(X_test)
            submit_pred= pd.DataFrame(submit_pred)
            submit_pred.columns = ['scalar_coupling_constant']
            sub = pd.concat([df_test_type['id'], submit_pred], axis = 1)
            submission_list.append(sub)
            # Show variable importance of a model
            #show_varimp(fit_model = fit_model, model_type = model_type, mol_type = moltype, X = X)

        print("Overall mean is ", np.array(list(dict_score.values())).mean())
        return [submission_list, dict_score]
    