In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [3]:
#loading the dataset
df = pd.read_csv(r'updated_job_dataset (1).csv')


In [4]:
# Accessing first 5 rows
df.head()

Unnamed: 0,education,experience,industry,skills,jobtitle,location,salary_range,work_hours,company_size,job_type,job_level,remote_option,salary_midpoint
0,Master,5-7 years,Healthcare,Human Resources,Project Manager,"Seattle, WA",40k-60k,Part-time,501-1000,Business Analyst,Mid-Level,Remote,139616
1,PhD,<1 year,Education,Accounting,Project Manager,"Boston, MA",80k-100k,Part-time,51-200,Software Engineer,Mid-Level,Remote,41064
2,Bachelor,5-7 years,Government,Programming,Data Analyst,"San Francisco, CA",60k-80k,Part-time,11-50,Business Analyst,Senior-Level,Remote,106304
3,PhD,7+ years,Healthcare,Accounting,Accountant,"Seattle, WA",100k-120k,Part-time,1-10,Research Scientist,Mid-Level,Remote,73854
4,PhD,3-5 years,Technology,Project Management,Data Analyst,"Seattle, WA",80k-100k,Part-time,51-200,Project Manager,Mid-Level,On-site,42536


In [5]:
#Accessing last 5 rows
df.tail()

Unnamed: 0,education,experience,industry,skills,jobtitle,location,salary_range,work_hours,company_size,job_type,job_level,remote_option,salary_midpoint
11995,Associate,1-3 years,Retail,Sales & Marketing,Accountant,"New York, NY",60k-80k,Full-time,51-200,Healthcare Professional,Entry-Level,Remote,137436
11996,PhD,3-5 years,Healthcare,Programming,HR Specialist,"Austin, TX",60k-80k,Full-time,11-50,Project Manager,Entry-Level,On-site,96468
11997,Associate,<1 year,Retail,Accounting,Project Manager,"Boston, MA",120k-140k,Full-time,1001-5000,Product Manager,Entry-Level,Remote,100191
11998,PhD,5-7 years,Finance,Human Resources,Project Manager,"Boston, MA",120k-140k,Part-time,5000+,Sales Associate,Entry-Level,On-site,68287
11999,Associate,5-7 years,Education,Project Management,Project Manager,"San Francisco, CA",60k-80k,Full-time,501-1000,Marketing Specialist,Mid-Level,Remote,82981


In [6]:
# finding NaN values
df.isna().sum()

education          0
experience         0
industry           0
skills             0
jobtitle           0
location           0
salary_range       0
work_hours         0
company_size       0
job_type           0
job_level          0
remote_option      0
salary_midpoint    0
dtype: int64

In [7]:
#finding datatypes
df.dtypes

education          object
experience         object
industry           object
skills             object
jobtitle           object
location           object
salary_range       object
work_hours         object
company_size       object
job_type           object
job_level          object
remote_option      object
salary_midpoint     int64
dtype: object

In [8]:
df['job_type'].value_counts()   

job_type
Product Manager                    848
Software Engineer                  841
Research Scientist                 835
Business Analyst                   822
Project Manager                    813
Healthcare Professional            801
Network Administrator              801
Marketing Specialist               800
Education Professional             796
Sales Associate                    789
Accountant                         785
Data Analyst                       780
HR Specialist                      768
Customer Support Representative    761
Engineer (Non-Software)            760
Name: count, dtype: int64

In [9]:
# using the Labelencoder to convert one datatype to another datatype
le = LabelEncoder()
for i in df.columns:
    if df[i].dtypes == 'object':
        df[i] = le.fit_transform(df[i])
print(df)

       education  experience  industry  skills  jobtitle  location  \
0              3           2         3       3         4         4   
1              4           4         0       0         4         1   
2              1           2         2       4         2         3   
3              4           3         3       0         0         4   
4              4           1         6       5         2         4   
...          ...         ...       ...     ...       ...       ...   
11995          0           0         5       6         0         2   
11996          4           1         3       4         3         0   
11997          0           4         5       0         4         1   
11998          4           2         1       3         4         1   
11999          0           2         0       5         4         3   

       salary_range  work_hours  company_size  job_type  job_level  \
0                 2           1             5         1          1   
1                 4

In [10]:
# columns
df.columns

Index(['education', 'experience', 'industry', 'skills', 'jobtitle', 'location',
       'salary_range', 'work_hours', 'company_size', 'job_type', 'job_level',
       'remote_option', 'salary_midpoint'],
      dtype='object')

In [11]:
df['job_type'].value_counts()   

job_type
10    848
14    841
12    835
1     822
11    813
7     801
9     801
8     800
4     796
13    789
0     785
3     780
6     768
2     761
5     760
Name: count, dtype: int64

In [12]:
# splitting the data
input = df.drop('job_type',axis = 1)
output = df['job_type']

In [13]:
x_train,x_test,y_train,y_test = train_test_split(input,output,test_size=0.3,random_state=150)


In [14]:
x_train

Unnamed: 0,education,experience,industry,skills,jobtitle,location,salary_range,work_hours,company_size,job_level,remote_option,salary_midpoint
7287,4,4,5,6,3,1,4,0,2,0,1,43606
10970,1,1,5,1,1,0,3,0,3,2,1,87205
4305,1,3,5,0,2,2,2,0,2,1,0,50374
1697,4,3,5,3,0,0,1,0,0,1,1,97299
9312,1,4,5,3,5,4,2,1,2,1,0,88706
...,...,...,...,...,...,...,...,...,...,...,...,...
2354,1,3,2,4,5,3,4,1,5,1,1,83229
496,1,2,3,2,6,0,1,1,0,2,0,108289
25,1,3,3,0,3,3,1,0,5,2,1,40159
11771,0,4,0,6,5,1,3,0,6,2,1,129992


In [15]:
x_test

Unnamed: 0,education,experience,industry,skills,jobtitle,location,salary_range,work_hours,company_size,job_level,remote_option,salary_midpoint
11363,2,4,1,2,0,4,1,1,5,0,0,103651
2675,2,3,3,1,5,0,3,0,5,1,1,64191
9668,2,0,6,3,5,4,4,0,0,0,1,122897
159,1,3,3,3,1,3,3,1,4,2,1,68095
3033,3,4,5,3,5,3,4,1,5,0,0,52406
...,...,...,...,...,...,...,...,...,...,...,...,...
9135,4,4,3,4,5,3,4,0,5,2,1,118392
4599,2,4,4,3,2,4,2,1,5,1,0,68832
4463,3,3,6,3,5,0,0,0,0,0,0,60715
911,1,3,6,6,6,2,0,0,3,2,0,110388


In [16]:
y_train

7287     11
10970    10
4305      0
1697      8
9312     13
         ..
2354      9
496       6
25        6
11771    11
10682     0
Name: job_type, Length: 8400, dtype: int32

In [17]:
y_test

11363    14
2675      7
9668     11
159      10
3033      7
         ..
9135      3
4599      8
4463     10
911      11
9891      5
Name: job_type, Length: 3600, dtype: int32

In [18]:
# model Training or algorithm
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
df_job = DecisionTreeClassifier()
df_job.fit(x_train,y_train)

In [19]:
y_pred = df_job.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.06888888888888889


In [20]:
import pickle
filename = 'Decision_job.sav'
pickle.dump(df_job, open(filename, 'wb'))

In [21]:
model = pickle.load(open(filename, 'rb'))

In [22]:
# confusion matrix
confusion_matrix(y_test,y_pred)

array([[19, 13, 13, 19, 16, 16, 17, 12,  9, 15, 15, 14, 18, 13, 14],
       [27, 19, 15, 20, 16, 11, 13, 17, 16, 15, 18, 15, 17, 13, 14],
       [11, 17, 16, 16, 13,  8,  9, 14, 17, 10, 12, 15, 14, 14, 13],
       [13, 13, 10, 20, 15, 10, 18, 22,  5, 13, 17, 16, 10, 14, 16],
       [14, 13, 17, 15, 23, 15, 18, 16, 10, 15, 19, 14, 26, 13, 14],
       [12, 21, 19, 12, 15, 12, 17,  7, 21, 13, 17, 12,  9, 23, 15],
       [12, 21, 13, 21, 21,  9, 18, 12, 11, 12, 25, 10,  7, 10, 13],
       [17, 15, 12, 15, 15, 15, 15, 10, 17, 17, 26, 14, 30, 10, 19],
       [ 7, 16, 20, 10, 15, 18, 11, 21, 12, 15, 15, 25, 11, 15, 18],
       [15, 21, 22, 19, 15, 17, 14, 11, 16, 22, 14, 13, 19, 22, 14],
       [29, 12, 18, 22, 18, 19, 18, 26, 17, 22, 11, 19, 23, 16, 19],
       [24, 22, 22, 19, 11, 16, 21, 14, 25, 27, 17, 16,  8,  9, 15],
       [13, 15, 15, 17, 14, 10, 22, 21, 16, 23, 15, 13, 16, 17, 16],
       [27, 13, 14, 15, 18, 13, 17, 14, 13, 19, 13, 22, 12, 16, 16],
       [22, 12, 15, 20, 21, 18, 14

In [23]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train,y_train)

In [24]:
y_pred = rf.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.06472222222222222


In [25]:
# confusion matrix 
confusion_matrix(y_test,y_pred)

array([[15, 21, 17, 14, 18, 14, 14, 13, 17, 16, 12, 15, 14,  8, 15],
       [26, 26, 15, 22, 14, 18, 17, 19, 10, 12, 12, 18, 10, 10, 17],
       [12, 17,  8, 25, 14, 12,  9,  7, 18, 15, 15, 14, 14,  8, 11],
       [16, 17, 12, 19, 14,  9, 16, 19, 16,  9,  8, 13, 18, 15, 11],
       [15, 18, 19, 19, 14, 14, 20, 11, 16, 16, 19, 12, 22, 13, 14],
       [15, 27, 11, 16, 20, 14, 16, 15, 11, 10, 14, 16, 11, 14, 15],
       [11, 19, 25, 16, 11, 11, 16, 12, 14, 15, 12, 12, 19, 11, 11],
       [16, 12, 16, 22, 12, 20, 17,  9, 21, 15, 15, 13, 21, 16, 22],
       [ 9, 17, 15, 18, 12, 15, 16, 15, 17, 18, 15, 18, 17, 15, 12],
       [16, 26, 20, 20, 10, 15, 12, 13, 18, 21, 21, 18, 21,  9, 14],
       [19, 24, 21, 20, 20, 16, 20, 19, 16, 20, 15, 21, 18, 17, 23],
       [22, 19, 16, 25, 16, 20, 17, 23, 16, 15, 19, 14, 17, 10, 17],
       [17, 13, 22, 18, 15, 14, 15, 15, 16, 17, 20, 15, 19, 13, 14],
       [23, 23, 11, 14, 17, 10, 25, 18, 22, 17, 11, 14, 10,  9, 18],
       [16, 18, 19, 14, 20, 23, 15

In [26]:
import pickle
filename = 'random_forest.sav'
pickle.dump(df_job, open(filename, 'wb'))

In [27]:
# prediction
abc=[4,1,1,4,6,3,1,0,2,0,0,127352]
result=rf.predict([abc])
result=result[0]
if result == 10:
    print('Product Manager')
elif result == 14:
    print('software Engineer')
elif result == 12:
    print('Research Scientist')
elif result == 1:
    print('Business Analyst ')
elif result == 11:
    print('Project Manager')
elif result == 7:
    print('Healthcare Professional')
elif result == 9 :
    print('Network Administrator')
elif result == 8 :
    print('Marketing Specialist ')
elif result == 4:
    print('Education Professional')
elif result == 13:
    print('Sales Associate ')
elif result == 0:
    print('Accountant ')
elif result ==3 :
    print('Data Analyst')
elif result ==6 :
    print('HR Specialist')
elif result == 2:
    print('Customer Support Representative')
elif result == 5:
    print('Engineer (Non-Software)')

Sales Associate 


