In [1]:
import numpy as np
import pandas as pd
from sklearn import naive_bayes 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [4]:
#reading the data
file = "C:/Users/USERID/Documents/GitHub/Employment2030/Employment2030/tables/noc_answers.csv"

data = pd.read_csv(file,index_col=['noc','workshop.number'])
data.sort_index(inplace=True)
data.loc[data.share == 'remain constant','share'] = 'constant'
data.dropna(inplace=True)

In [5]:
#splitting up x and y
x = data.drop(['absolute','share','Unnamed: 0','noc_code'],axis=1) #making x data frame
x['work_num'] = x.index.get_level_values(1) #making workshop number a variable as well as an index
x.drop_duplicates(inplace=True)
x_round = np.round(x).astype(int)#rounded x (for multimial distribution)

y_abs = data['absolute'].copy() #y - absolute change
y_share = data['share'].copy() #y - proportional change

#finding most frequant answer for each noc-workshop for dependent variable
y_abs = pd.DataFrame(y_abs).pivot_table(index = ['noc','workshop.number'], columns = 'absolute', aggfunc = len).fillna(0)
y_abs['not_increase'] = y_abs['fewer'] + y_abs['same']
y_abs['y'] = y_abs[['fewer','more','same']].idxmax(axis=1)
y_abs['binned_y'] = y_abs[['more','not_increase']].idxmax(axis=1)

y_share = pd.DataFrame(y_share).pivot_table(index = ['noc','workshop.number'], columns = 'share', aggfunc = len).fillna(0)
y_share['not_increase'] = y_share['decrease'] + y_share['constant']
y_share['y'] = y_share[['constant','decrease','increase']].idxmax(axis=1)
y_share['binned_y'] = y_share[['increase','not_increase']].idxmax(axis=1)

In [6]:
selector = SelectKBest(mutual_info_classif)
gnb = naive_bayes.GaussianNB()
mnb = naive_bayes.MultinomialNB()

gnb_pipe = Pipeline([('selector', selector), ('classifier', gnb)])
mnb_pipe = Pipeline([('selector', selector), ('classifier', mnb)])

param_grid = {'selector__k': np.asarray(range(1,50,1))}
gnb_search = GridSearchCV(gnb_pipe, param_grid, iid=False, cv=5)
mnb_search = GridSearchCV(mnb_pipe , param_grid, iid=False, cv=5)

x_set = [x,x_round]
y_set = [y_abs['y'],y_share['y'],y_abs['binned_y'],y_share['binned_y']]
models = [gnb_search,mnb_search]
trained_models = np.empty(8,dtype = Pipeline)
scores = np.empty(8)

for i in range(2):
    for j in range(4):
        models[i].fit(x_set[i],y_set[j])
        trained_models[j+i*j] = models[i].best_estimator_
        scores[j+i*j] = models[i].best_score_

KeyboardInterrupt: 

In [35]:
#the selected features for multinomial model with absolute binned y
features = x_round.columns[trained_models[6].named_steps.selector.get_support()]
pd.DataFrame(np.exp(trained_models[6].named_steps.classifier.feature_log_prob_),columns=features,index=['more','not_increase'])

Unnamed: 0,value.Writing,value.Persuasion,value.Service Orientation,value.Technology Design,value.Operation Monitoring,value.Systems Analysis,value.Written Comprehension,value.Fluency of Ideas,value.Memorization,value.Arm-Hand Steadiness,value.Control Precision,value.Reaction Time,value.Static Strength,value.Gross Body Equilibrium,value.Administration and Management,value.Clerical,value.Customer and Personal Service,value.Computers and Electronics,value.Engineering and Technology,value.Law and Government
more,0.058789,0.054715,0.059371,0.041327,0.048894,0.055879,0.065774,0.054715,0.047148,0.047148,0.041909,0.035506,0.030268,0.028522,0.056461,0.05064,0.066938,0.06461,0.04773,0.043655
not_increase,0.06056,0.050862,0.053233,0.031466,0.054095,0.049784,0.067888,0.050862,0.042026,0.050647,0.050431,0.042672,0.041595,0.034914,0.058836,0.052802,0.059698,0.054095,0.041164,0.052371


In [18]:
scores

array([6.77318841e-001, 5.37492754e-001, 5.10362319e-001, 7.56826087e-001,
       6.58434783e-001, 3.45845952e-322, 7.89826087e-001, 3.90311860e-322])