In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
## Load the prediction dataset from the CSV file into a DataFrame
data = pd.read_csv('prediction-data.csv')

In [None]:
## Preview the first five rows of the dataframe
data.head(n=5)

: 

In [None]:
## Dimensions of the dataframe as a (rows, columns) tuple
data.shape

In [None]:
## Print a concise summary of the dataframe (DataFrame.info)
data.info()

In [None]:
## Count the missing values in each column
data.isna().sum()

No missing values in the dataset

In [None]:
## Summary statistics of the dataset (DataFrame.describe)
data.describe()

In [None]:
## Split the column names by dtype: numeric columns vs. object (string) columns
numerical_features = data.select_dtypes(include=np.number).columns.tolist()
categorical_features = data.select_dtypes(include=['object']).columns.tolist()

print("List of Numerical features: \n", numerical_features)
print("\n\nList of Categorical features: \n", categorical_features)

### Value Counts for Selected Features

In [None]:
## Frequency of each distinct value in "Logical quotient rating"
logical_quotient_counts = data["Logical quotient rating"].value_counts()
print(logical_quotient_counts)

In [None]:
## Frequency of each distinct value in "hackathons"
hackathon_counts = data["hackathons"].value_counts()
print(hackathon_counts)

In [None]:
## Frequency of each distinct value in "coding skills rating"
coding_skill_counts = data["coding skills rating"].value_counts()
print(coding_skill_counts)

In [None]:
## Frequency of each distinct value in "public speaking points"
public_speaking_counts = data["public speaking points"].value_counts()
print(public_speaking_counts)

In [None]:
## Frequency of each distinct value in "interested career area " (the column name ends with a space)
career_area_counts = data["interested career area "].value_counts()
print(career_area_counts)

In [None]:
## Frequency of each class in the target column 'Suggested Job Role'
job_role_counts = data['Suggested Job Role'].value_counts()
print(job_role_counts)

In [None]:
## List the column labels of the dataframe
column_labels = data.columns
print(column_labels)

# **Feature Engineering**

### Encoding the categorical features

In [None]:
## Mappings shared by several columns
YES_NO_CODES = {'yes': 1, 'no': 0}
SKILL_LEVEL_CODES = {'poor': 0, 'medium': 1, 'excellent': 2}

## Integer code for every category label, keyed by column name.
## The column names and labels must match the CSV contents exactly
## (including the trailing space in 'interested career area ').
CATEGORY_CODES = {
    ## encoding "self-learning capability?" column.
    'self-learning capability?': YES_NO_CODES,

    ## encoding "Extra-courses did" column.
    'Extra-courses did': YES_NO_CODES,

    ## encoding "certifications" column.
    'certifications': {
        'r programming': 0,
        'information security': 1,
        'shell programming': 2,
        'machine learning': 3,
        'full stack': 4,
        'hadoop': 5,
        'python': 6,
        'distro making': 7,
        'app development': 8,
    },

    ## encoding "workshops" column.
    'workshops': {
        'database security': 0,
        'system designing': 1,
        'web technologies': 2,
        'hacking': 3,
        'testing': 4,
        'data science': 5,
        'game development': 6,
        'cloud computing': 7,
    },

    ## encoding "reading and writing skills" column.
    'reading and writing skills': SKILL_LEVEL_CODES,

    ## encoding "memory capability score" column.
    'memory capability score': SKILL_LEVEL_CODES,

    ## encoding "Interested subjects" column.
    'Interested subjects': {
        'Software Engineering': 0,
        'IOT': 1,
        'cloud computing': 2,
        'programming': 3,
        'networks': 4,
        'Computer Architecture': 5,
        'data engineering': 6,
        'hacking': 7,
        'Management': 8,
        'parallel computing': 9,
    },

    ## encoding "interested career area" column.
    'interested career area ': {
        'system developer': 0,
        'security': 1,
        'Business process analyst': 2,
        'developer': 3,
        'testing': 4,
        'cloud computing': 5,
    },

    ## encoding "Type of company want to settle in?" column.
    'Type of company want to settle in?': {
        'Service Based': 0,
        'Web Services': 1,
        'BPA': 2,
        'Testing and Maintainance Services': 3,
        'Product based': 4,
        'Finance': 5,
        'Cloud Services': 6,
        'product development': 7,
        'Sales and Marketing': 8,
        'SAaS services': 9,
    },

    ## encoding "Taken inputs from seniors or elders" column.
    'Taken inputs from seniors or elders': YES_NO_CODES,

    ## encoding "Interested Type of Books" column.
    'Interested Type of Books': {
        'Guide': 0,
        'Health': 1,
        'Self help': 2,
        'Horror': 3,
        'Biographies': 4,
        'Science fiction': 5,
        'Satire': 6,
        'Childrens': 7,
        'Autobiographies': 8,
        'Prayer books': 9,
        'Fantasy': 10,
        'Journals': 11,
        'Trilogy': 12,
        'Anthology': 13,
        'Encyclopedias': 14,
        'Drama': 15,
        'Mystery': 16,
        'History': 17,
        'Science': 18,
        'Dictionaries': 19,
        'Diaries': 20,
        'Religion-Spirituality': 21,
        'Action and Adventure': 22,
        'Poetry': 23,
        'Cookbooks': 24,
        'Comics': 25,
        'Art': 26,
        'Travel': 27,
        'Series': 28,
        'Math': 29,
        'Romance': 30,
    },

    ## encoding "Management or Technical" column.
    'Management or Technical': {'Management': 0, 'Technical': 1},

    ## encoding "hard/smart worker" column.
    'hard/smart worker': {'hard worker': 0, 'smart worker': 1},

    ## encoding "worked in teams ever?" column.
    'worked in teams ever?': YES_NO_CODES,

    ## encoding "Introvert" column.
    'Introvert': YES_NO_CODES,
}

## Apply the mappings in place, one column at a time, in the order listed above.
## Labels that are not listed in a mapping are left unchanged by replace().
for column_name, label_codes in CATEGORY_CODES.items():
    data.replace({column_name: label_codes}, inplace=True)


In [None]:
## Separate the label column (Y) from the predictor columns (X)
target_column = 'Suggested Job Role'
Y = data[target_column]
X = data.drop(columns=[target_column])

### Head Values

In [None]:
## Preview the first five rows again, now that the categorical columns have been encoded
data.head(n=5)

### Model Training


In [None]:
import sklearn
from sklearn.model_selection import train_test_split

## Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
## Compare the shape of the full feature matrix with its train and test partitions
full_shape, train_shape, test_shape = X.shape, X_train.shape, X_test.shape
print(full_shape, train_shape, test_shape)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Artificial Neural Network (MLPClassifier)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, f1_score, accuracy_score, confusion_matrix

# Multi-layer perceptron with two hidden layers of 100 units each, fitted on the training split
ann_model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    random_state=1,
    max_iter=500,
)
ann_model.fit(X_train, Y_train)

# Class predictions for the held-out rows
Y_pred = ann_model.predict(X_test)

# Compute all metrics first; precision and F1 are averaged across classes weighted by support
ann_precision = precision_score(Y_test, Y_pred, average='weighted')
ann_f1 = f1_score(Y_test, Y_pred, average='weighted')
accuracy = accuracy_score(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)

# Report the metrics, each followed by a spacer line
print("Precision of ANN Classifier: ", ann_precision)
print("  ")

print("F1-Score of ANN Classifier: ", ann_f1)
print("  ")

# accuracy is a fraction, shown here as a percentage
print("Accuracy Score of ANN Classifier: ", accuracy * 100)
print("  ")

print("Confusion Matrix = ")
print(cm)


In [None]:
## Encoded feature values for a single candidate.
## NOTE(review): the values are presumably listed in the column order of X - confirm.
userdata = [[5,0,6,2,1,0,1,4,0,0,3,4,2,0,28,0,1,1,0]]

## Attach the training column names so the model receives the same feature labels it was
## fitted with; this also fails loudly if the number of values does not match X.
user_features = pd.DataFrame(userdata, columns=X.columns)

ynewclass = ann_model.predict(user_features)
ynew = ann_model.predict_proba(user_features)
print("Prediction :",ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

Prediction : ['Applications Developer']
Probabilities of all classes:  [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Probability of Predicted class :  1.0




### SVM

In [None]:
from sklearn.svm import SVC

## Support-vector classifier with scikit-learn's default settings.
## Importing the class directly avoids rebinding the `sklearn.svm` module name;
## the estimator keeps the name `svm` because the prediction cell that follows uses it.
svm = SVC()
svm.fit(X_train, Y_train)

## Class predictions for the held-out rows
svm_y_pred = svm.predict(X_test)

print("Precision of SVM Classifier: ",precision_score(Y_test, svm_y_pred, average='weighted'))
print("  ")

print("F1-Score of SVM Classifier: ",f1_score(Y_test, svm_y_pred, average='weighted'))
print("  ")

svm_accuracy = accuracy_score(Y_test,svm_y_pred)
## accuracy_score returns a fraction; multiply by 100 (not 10) to report a percentage
print("Accuracy Score of SVM Classifier: ",svm_accuracy*100)
print("  ")


svm_cm = confusion_matrix(Y_test,svm_y_pred)

print("confusion matrics : ")
print(svm_cm)



In [None]:
## Encoded feature values for a single candidate, given as numbers rather than strings:
## scikit-learn's input validation does not accept string-typed arrays as numeric data.
## NOTE(review): the values are presumably listed in the column order of X - confirm.
userdata = [[5,0,6,2,1,0,1,4,0,0,3,4,2,0,28,0,1,1,0]]

## Attach the training column names so the model receives the same feature labels it was fitted with
user_features = pd.DataFrame(userdata, columns=X.columns)

ynewclass = svm.predict(user_features)
## decision_function returns signed scores, not probabilities
## (the SVC was created without probability=True, so predict_proba is unavailable)
ynew = svm.decision_function(user_features)
print("Prediction :",ynewclass)
print("Decision function scores of all classes: ", ynew)
print("Highest decision function score : ", np.max(ynew))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Random forest with default hyper-parameters; the fixed random_state makes the fit repeatable
rf = RandomForestClassifier(random_state = 10)
rf.fit(X_train, Y_train)

## Class predictions for the held-out rows
rfc_y_pred = rf.predict(X_test)

print("Precision of RandomForest Classifier: ",precision_score(Y_test, rfc_y_pred, average='weighted'))
print("  ")

print("F1-Score of RandomForest Classifier: ",f1_score(Y_test, rfc_y_pred, average='weighted'))
print("  ")

rfc_accuracy = accuracy_score(Y_test,rfc_y_pred)
## accuracy_score returns a fraction; multiply by 100 (not 10) to report a percentage
print("Accuracy of RandomForest Classifier: ",rfc_accuracy*100)
print("  ")

rfc_cm = confusion_matrix(Y_test,rfc_y_pred)

print("confusion matrics=")
print(rfc_cm)

In [None]:
## Encoded feature values for a single candidate, given as numbers rather than strings:
## scikit-learn's input validation does not accept string-typed arrays as numeric data.
## NOTE(review): the values are presumably listed in the column order of X - confirm.
userdata = [[5,0,6,2,1,0,1,4,0,0,3,4,2,0,28,0,1,1,0]]

## Attach the training column names so the model receives the same feature labels it was fitted with
user_features = pd.DataFrame(userdata, columns=X.columns)

ynewclass = rf.predict(user_features)
ynew = rf.predict_proba(user_features)
print("Prediction :",ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

### Create a .pkl file using the ANN (MLPClassifier) model (saved as `dtmodel.pkl`)

In [None]:
import pickle

## Persist the fitted ANN (MLPClassifier) model.
## The file name 'dtmodel.pkl' is left unchanged in case other code loads it by that name.
## The `with` block closes the file handle once the dump has finished.
with open('dtmodel.pkl', 'wb') as model_file:
    pickle.dump(ann_model, model_file)

### Create a .pkl file using Random Forest

In [None]:
import pickle

## Persist the fitted random forest model.
## The `with` block closes the file handle once the dump has finished.
with open('rfmodel.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
##print(sklearn.__version__)
##print(pd.__version__)