In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import naive_bayes 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [5]:
#reading the data
file = "/home/joshua/Downloads/Employment2030-master/Employment2030/tables/scores_answers.csv"

#one chained expression: load -> sort the (noc, workshop.number) index ->
#relabel 'remain constant' answers in `share` as 'constant' -> drop rows with any missing value
data = (
    pd.read_csv(file, index_col=['noc', 'workshop.number'])
    .sort_index()
    .assign(share=lambda frame: frame['share'].mask(frame['share'] == 'remain constant', 'constant'))
    .dropna()
)

In [6]:
#splitting up x and y
x = data.drop(['absolute','share','Unnamed: 0','noc_code'],axis=1) #making x data frame
x['work_num'] = x.index.get_level_values(1) #making workshop number a variable as well as an index
#NOTE: a bare `x.drop_duplicates()` used to sit here. Its return value was discarded, so it never
#changed x. It is removed rather than assigned back, because dropping rows from x alone would
#leave x with fewer rows than the y series built from `data` below.
x_round = np.round(x) #rounded x (for the multinomial distribution)
y_abs = data['absolute'].copy() #y - absolute change
y_share = data['share'].copy() #y - proportional change


#binary y's: every answer other than the "increase" one is collapsed into 'not_increase'
y_abs_bin = y_abs.mask(y_abs.isin(['fewer','same']), 'not_increase')
y_share_bin = y_share.mask(y_share.isin(['decrease','constant']), 'not_increase')

In [7]:
#answer proportions to check learned probabilities against
def _answer_proportions(answers, answer_column, merged_labels):
    """Tabulate, per (noc, workshop.number), the fraction of rows giving each answer.

    answers       -- Series of answers indexed by ('noc', 'workshop.number')
    answer_column -- name of that Series, used as the pivot column
    merged_labels -- the two answer labels whose counts are added into 'not_increase'

    Returns a frame with one column per answer, a 'sum' column holding the raw
    row count, and a 'not_increase' column; every column except 'sum' is
    divided by 'sum'.
    """
    table = pd.DataFrame(answers).pivot_table(
        index=['noc', 'workshop.number'],
        columns=answer_column,
        aggfunc=len,
    ).fillna(0)
    table['sum'] = table.sum(axis=1)
    first_label, second_label = merged_labels
    table['not_increase'] = table[first_label] + table[second_label]
    #turn counts into proportions, leaving the 'sum' column as a raw count
    not_total = table.columns != 'sum'
    table.loc[:, not_total] = table.loc[:, not_total].divide(table['sum'], axis=0)
    return table


abs_answer = _answer_proportions(y_abs, 'absolute', ('fewer', 'same'))

share_answer = _answer_proportions(y_share, 'share', ('decrease', 'constant'))

In [8]:
#feature selection followed by a naive Bayes classifier, tuned by grid search
selector = SelectKBest()
gnb = naive_bayes.GaussianNB()
mnb = naive_bayes.MultinomialNB()

gnb_pipe = Pipeline([('selector', selector), ('classifier', gnb)])
mnb_pipe = Pipeline([('selector', selector), ('classifier', mnb)])

#search over the number of kept features (5, 10, ..., 45) and the scoring function
param_grid = {'selector__k': np.asarray(range(5,50,5)),
              'selector__score_func': [chi2,mutual_info_classif]
             }
#NOTE(review): newer scikit-learn releases no longer accept the `iid` argument - confirm the
#installed version before re-running; it is kept so the search behaves as it did originally.
gnb_search = GridSearchCV(gnb_pipe, param_grid, iid=False, cv=5)
mnb_search = GridSearchCV(mnb_pipe , param_grid, iid=False, cv=5)


def _tune(search, features, target):
    """Fit `search` on (features, target) and return the refitted best pipeline.

    Each fit builds a fresh `best_estimator_`, so the object returned by an
    earlier call is not overwritten by a later one.
    """
    return search.fit(features, target).best_estimator_


#gaussian models use the raw features
gnb_abs_model = _tune(gnb_search, x, y_abs)
gnb_abs_bin_model = _tune(gnb_search, x, y_abs_bin)
gnb_share_model = _tune(gnb_search, x, y_share)
gnb_share_bin_model = _tune(gnb_search, x, y_share_bin)

#multinomial models use the rounded features.
#Fix: mnb_abs_model and mnb_share_model used to read gnb_search.best_estimator_, which made
#them copies of the last gaussian model instead of the multinomial one just fitted.
mnb_abs_model = _tune(mnb_search, x_round, y_abs)
mnb_abs_bin_model = _tune(mnb_search, x_round, y_abs_bin)
mnb_share_model = _tune(mnb_search, x_round, y_share)
mnb_share_bin_model = _tune(mnb_search, x_round, y_share_bin)

In [30]:
#Looping through the different model setups and running them on random training/testing sets n_runs times.
#results holds one accuracy per run and model; average over axis 0 for the mean accuracy of each model.
#each run randomly selects 80% of the noc-workshop combinations as the training set; the rest is the testing set.

occ_workshop = abs_answer.index
n_runs = 100 #the loop used to run once while results had 100 rows, leaving 99 rows of zeros
n_targets = 4
#results[i,j,k]: run i, feature set j (0: x with the gaussian models, 1: x_round with the
#multinomial models), target k (0: absolute, 1: share, 2: absolute binary, 3: share binary)
results = np.zeros([n_runs,2,n_targets])

#one block of n_targets models per feature set, in the same target order as y_train / y_test
#below, so that models[j*n_targets + k] is the model tuned for feature set j and target k.
#(the list used to be ordered abs, abs_bin, share, share_bin and indexed with k+j*k, which
#paired models with the wrong targets and never reached two of the multinomial models)
models = [gnb_abs_model, gnb_share_model, gnb_abs_bin_model, gnb_share_bin_model,
          mnb_abs_model, mnb_share_model, mnb_abs_bin_model, mnb_share_bin_model]

for i in range(n_runs):
    train_occ_worshops = np.random.choice(occ_workshop,size=int(len(occ_workshop)*0.8),replace=False)
    test_occ_worshops = occ_workshop[np.logical_not(occ_workshop.isin(train_occ_worshops))]

    #setting different x and y setups to iterate through
    x_train = [x.loc[train_occ_worshops], x_round.loc[train_occ_worshops]]
    x_test = [x.loc[test_occ_worshops], x_round.loc[test_occ_worshops]]

    y_train = [y_abs.loc[train_occ_worshops], y_share.loc[train_occ_worshops],
           y_abs_bin.loc[train_occ_worshops], y_share_bin.loc[train_occ_worshops]]
    #testing targets are the answer proportions per noc-workshop, not the individual answers
    y_test = [abs_answer.loc[test_occ_worshops,['fewer','more','same']],
           share_answer.loc[test_occ_worshops,['constant','decrease','increase']],
           abs_answer.loc[test_occ_worshops,['more','not_increase']],
           share_answer.loc[test_occ_worshops,['increase','not_increase']]]

    for j in range(2):
        for k in range(n_targets):
            model = models[j*n_targets + k].fit(x_train[j],y_train[k])
            #keep one prediction (the first row) per noc-workshop combination
            pred = pd.DataFrame(model.predict(x_test[j])).set_index(x_test[j].index).loc[~x_test[j].index.duplicated(keep='first')]
            #a prediction counts as correct when it equals the answer with the largest proportion
            results[i,j,k] = sum(np.asarray(y_test[k].idxmax(axis=1))==np.asarray(pred[0]))/y_test[k].shape[0]

In [36]:
#predictions left in `pred` by the last (j, k) pass of the evaluation loop, one row per noc-workshop
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,0
noc,workshop.number,Unnamed: 2_level_1
Accommodation service managers,4,increase
"Air pilots, flight engineers and flying instructors",3,not_increase
Air transport ramp attendants,3,not_increase
Computer network technicians,1,increase
Computer network technicians,3,increase
Correctional service officers,3,increase
Electrical mechanics,1,increase
Electrical mechanics,2,increase
Electrical mechanics,5,increase
Financial managers,2,increase


In [38]:
#answer with the largest proportion for each noc-workshop in the last testing set (k is left over from the loop)
y_test[k].idxmax(axis=1)

noc                                                                      workshop.number
Accommodation service managers                                           4                  not_increase
Air pilots, flight engineers and flying instructors                      3                  not_increase
Air transport ramp attendants                                            3                  not_increase
Computer network technicians                                             1                      increase
                                                                         3                      increase
Correctional service officers                                            3                  not_increase
Electrical mechanics                                                     1                  not_increase
                                                                         2                      increase
                                                                       