In [1]:
# all imports
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [4]:
# reading the dataset: salary_table.csv
sal_data = pd.read_csv('e:\salary_table.csv')

# printing first 5 rows
sal_data.head()

Unnamed: 0,salary,experience,education,management
0,13876,1,Bachelor,Y
1,11608,1,Ph.D,N
2,18701,1,Ph.D,Y
3,11283,1,Master,N
4,11767,1,Ph.D,N


In [5]:
# copying the data in another dataframe for modification and further use
df = sal_data.copy()

In [6]:
# converting the education types to codes
# Bacherlor as  0
# Ph.D      as  1
# Master    as  2
edu_fact = pd.factorize(df['education'])[0]

# adding a column of education codes to the dataframe
df['edu_code'] = edu_fact

# converting the education types to codes
# Y as 0
# N as 1
manag_fact = pd.factorize(df['management'])[0]

# adding a column of management codes to the dataframe
df['manag_code'] = manag_fact

# first 5 rows
df.head()

Unnamed: 0,salary,experience,education,management,edu_code,manag_code
0,13876,1,Bachelor,Y,0,0
1,11608,1,Ph.D,N,1,1
2,18701,1,Ph.D,Y,1,0
3,11283,1,Master,N,2,1
4,11767,1,Ph.D,N,1,1


In [11]:
# dividing the dataset into training and testing dataset
# adding a new column is_train with boolean values
# True - training data
# False - testing data
df['is_train'] = np.random.uniform(0, 1, len(df)) < 0.75

df.tail()

Unnamed: 0,salary,experience,education,management,edu_code,manag_code,is_train
41,27837,16,Master,Y,2,0,False
42,18838,16,Master,N,2,1,True
43,17483,16,Bachelor,N,0,1,True
44,19207,17,Master,N,2,1,False
45,19346,20,Bachelor,N,0,1,True


In [12]:
# making training dataset and testing dataset
train, test = df[df['is_train']==True], df[df['is_train']==False]

# length of training and test data
print('No of training data:', len(train))
print('No of testing data:', len(test))

No of training data: 36
No of testing data: 10


In [13]:
train

Unnamed: 0,salary,experience,education,management,edu_code,manag_code,is_train
1,11608,1,Ph.D,N,1,1,True
2,18701,1,Ph.D,Y,1,0,True
3,11283,1,Master,N,2,1,True
4,11767,1,Ph.D,N,1,1,True
5,20872,2,Master,Y,2,0,True
6,11772,2,Master,N,2,1,True
7,10535,2,Bachelor,N,0,1,True
8,12195,2,Ph.D,N,1,1,True
10,14975,3,Bachelor,Y,0,0,True
11,21371,3,Master,Y,2,0,True


In [14]:
test

Unnamed: 0,salary,experience,education,management,edu_code,manag_code,is_train
0,13876,1,Bachelor,Y,0,0,False
9,12313,3,Master,N,2,1,False
25,14803,8,Master,N,2,1,False
26,17404,8,Bachelor,Y,0,0,False
31,23174,10,Ph.D,Y,1,0,False
35,16882,12,Master,N,2,1,False
37,15990,13,Bachelor,N,0,1,False
39,17949,14,Master,N,2,1,False
41,27837,16,Master,Y,2,0,False
44,19207,17,Master,N,2,1,False


In [18]:
# preprocessing the data
# creating a list of the feature columns' names
features = df.columns[1], df.columns[4], df.columns[5]
features = pd.Index(features)
features

Index(['experience', 'edu_code', 'manag_code'], dtype='object')

In [19]:
# taking all the salaries into a nd array
sal = np.array(train.salary)
sal

array([11608, 18701, 11283, 11767, 20872, 11772, 10535, 12195, 14975,
       21371, 19800, 11417, 20263, 13231, 12884, 13245, 13677, 15965,
       12336, 21352, 13839, 22884, 16978, 22184, 13548, 14467, 15942,
       23780, 25410, 14861, 24170, 26330, 25685, 18838, 17483, 19346],
      dtype=int64)

In [20]:
# training the random forest classifier
# creating a random forest classifier
clf = RandomForestClassifier(n_jobs = 2, random_state = 0)
clf.fit(train[features], sal)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
# predicting salaries from the test data
pred = clf.predict(test[features])
pred

array([14975, 11772, 13839, 16978, 22184, 15942, 14861, 18838, 26330,
       18838], dtype=int64)

In [22]:
test

Unnamed: 0,salary,experience,education,management,edu_code,manag_code,is_train
0,13876,1,Bachelor,Y,0,0,False
9,12313,3,Master,N,2,1,False
25,14803,8,Master,N,2,1,False
26,17404,8,Bachelor,Y,0,0,False
31,23174,10,Ph.D,Y,1,0,False
35,16882,12,Master,N,2,1,False
37,15990,13,Bachelor,N,0,1,False
39,17949,14,Master,N,2,1,False
41,27837,16,Master,Y,2,0,False
44,19207,17,Master,N,2,1,False


In [23]:
# view the predicted probabilties
clf.predict_proba(test[features])

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0.4, 0. , 0.2, 0.1, 0. , 0.2, 0. , 0. , 0. , 0. ,
        0. , 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.1, 0. , 0. , 0. , 0.6, 0.1, 0. , 0.2, 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0. ,
        0.8, 0. , 0. , 0. , 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0.2, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0.3, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0.2, 0. , 0.5, 0. , 0

In [24]:
# creating the confusion matrix
pd.crosstab(test['salary'], pred, rownames=['Actual Salary'], colnames=['Predicted Salary'])

Predicted Salary,11772,13839,14861,14975,15942,16978,18838,22184,26330
Actual Salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12313,1,0,0,0,0,0,0,0,0
13876,0,0,0,1,0,0,0,0,0
14803,0,1,0,0,0,0,0,0,0
15990,0,0,1,0,0,0,0,0,0
16882,0,0,0,0,1,0,0,0,0
17404,0,0,0,0,0,1,0,0,0
17949,0,0,0,0,0,0,1,0,0
19207,0,0,0,0,0,0,1,0,0
23174,0,0,0,0,0,0,0,1,0
27837,0,0,0,0,0,0,0,0,1


In [63]:
# view feature importance 
# view a list of the features and there important score
list(zip(train[features], clf.feature_importances_))

[('experience', 0.69803317491940275),
 ('edu_code', 0.191381834851925),
 ('manag_code', 0.11058499022867226)]

In [1]:
import pandas as pd
import numpy as np


In [15]:
size = pd.Series(np.random.normal(loc=175,size=20,scale=5))  
# loc -> Mean (“centre”) of the distribution.
# scale -> Standard deviation (spread or “width”) of the distribution. Must be non-negative.
# print(size)
sizelist = list(size)
print ("\n",sizelist,max(sizelist),min(sizelist),np.std(sizelist),max(sizelist)-175,175-min(sizelist))


 [176.74062282315344, 179.8355729567487, 181.02582049607295, 175.37215746000865, 176.69284557995084, 172.73018184690085, 177.88895866080782, 181.4613908858415, 175.45023550289125, 178.28637293061726, 174.50202499841654, 175.58989796788822, 169.3629831606906, 176.9024277905338, 177.49230305312656, 169.81897469108085, 184.22947748942096, 178.29187609134888, 182.92219892909048, 175.02657110110297] 184.22947748942096 169.3629831606906 3.7503777475397153 9.229477489420958 5.63701683930941
