In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_excel('C:/Users/Ananya/Downloads/recruitmentdataset.xlsx')

# Fill missing values in 'salary' with the median salary.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the data['salary'] selection: with pandas
# Copy-on-Write (the default from pandas 3.0) that selection behaves as a
# copy, so the in-place form would leave `data` unchanged, and pandas 2.2
# already emits a FutureWarning for it.
# Note: 'salary' is not among the features selected below, so this
# imputation does not influence the model trained in this cell.
median_salary = data['salary'].median()
data['salary'] = data['salary'].fillna(median_salary)

# Selecting relevant features and target
features = data[['ssc_p', 'hsc_p', 'degree_p', 'workex', 'etest_p', 'mba_p']]
target = data['status']

# Convert categorical data to numeric using one-hot encoding
# (numeric columns pass through get_dummies unchanged)
features = pd.get_dummies(features)

# Splitting data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Creating and training the decision tree model
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Predicting and evaluating the model on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the decision tree model:", accuracy)

# Displaying feature importance: pair each encoded column with the fitted
# model's importance score and keep the five highest-scoring rows
importance = pd.DataFrame({'Feature': features.columns, 'Importance': model.feature_importances_})
top_features = importance.sort_values(by='Importance', ascending=False).head()
print("Top features contributing to the model predictions:")
print(top_features)


Accuracy of the decision tree model: 0.813953488372093
Top features contributing to the model predictions:
    Feature  Importance
0     ssc_p    0.569767
4     mba_p    0.200663
1     hsc_p    0.169747
2  degree_p    0.059823
3   etest_p    0.000000


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_excel('C:/Users/Ananya/Downloads/recruitmentdataset.xlsx')

# Fill missing 'salary' values with the column median.
# Assign the filled column back instead of using fillna(inplace=True) on the
# data['salary'] selection: under pandas Copy-on-Write (default from pandas
# 3.0) the selection behaves as a copy and the in-place call would not
# update `data`; pandas 2.2 already warns about this pattern.
# Note: 'salary' is not part of the feature set selected below.
median_salary = data['salary'].median()
data['salary'] = data['salary'].fillna(median_salary)

# Select the predictor columns and the target column
features = data[['ssc_p', 'hsc_p', 'degree_p', 'workex', 'etest_p', 'mba_p']]
target = data['status']

# One-hot encode categorical predictors (numeric columns are left as-is)
features = pd.get_dummies(features)

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit a random forest of 100 depth-limited trees
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Evaluate accuracy on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the random forest model:", accuracy)

# Pair each encoded column with the fitted model's importance score and
# show the five highest-scoring rows
importance = pd.DataFrame({'Feature': features.columns, 'Importance': model.feature_importances_})
top_features = importance.sort_values(by='Importance', ascending=False).head()
print("Top features contributing to the model predictions:")
print(top_features)


Accuracy of the random forest model: 0.7674418604651163
Top features contributing to the model predictions:
    Feature  Importance
0     ssc_p    0.339981
1     hsc_p    0.227594
2  degree_p    0.207188
4     mba_p    0.112551
3   etest_p    0.056969
