In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [3]:
path=r"Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv"
df = pd.read_csv(path)

print(df.head())

   Unnamed: 0 experience                                    job_description  \
0           0    5-7 yrs  Exp: Minimum 5 years;Good understanding of IOC...   
1           1  10-17 yrs  He should have handled a team of atleast 5-6 d...   
2           2    5-9 yrs  Must be an effective communicator (written & s...   
3           3   7-10 yrs  7  -  10 years of overall experience in data e...   
4           4    1-3 yrs  Chartered Accountancy degree or MBA in Finance...   

                                           job_desig   job_type  \
0        Senior Exploit and Vulnerability Researcher        NaN   
1                                           Head SCM        NaN   
2  Deputy Manager - Talent Management & Leadershi...  Analytics   
3                 Associate Manager Data Engineering  Analytics   
4                            TS- GSA- Senior Analyst        NaN   

                                          key_skills               location  \
0  team skills, communication skills, analy

In [4]:
print(df.columns)

Index(['Unnamed: 0', 'experience', 'job_description', 'job_desig', 'job_type',
       'key_skills', 'location', 'salary', 'company_name_encoded'],
      dtype='object')


In [5]:
df['experience']

0          5-7 yrs
1        10-17 yrs
2          5-9 yrs
3         7-10 yrs
4          1-3 yrs
           ...    
19797    12-18 yrs
19798      0-3 yrs
19799     8-13 yrs
19800      1-3 yrs
19801      2-5 yrs
Name: experience, Length: 19802, dtype: object

In [6]:
df[['min_exp', 'max_exp']] = df['experience'].str.split('-', expand=True)
df['min_exp'] = pd.to_numeric(df['min_exp'], errors='coerce')
df['max_exp'] = pd.to_numeric(df['max_exp'], errors='coerce')

In [7]:
le_location = LabelEncoder()
le_salary = LabelEncoder()

df['Location_encoded'] = le_location.fit_transform(df['location'])
df['Salary_encoded'] = le_salary.fit_transform(df['salary'])


X = df[['company_name_encoded',  'Location_encoded']]
y = df['Salary_encoded']

In [8]:
df.columns

Index(['Unnamed: 0', 'experience', 'job_description', 'job_desig', 'job_type',
       'key_skills', 'location', 'salary', 'company_name_encoded', 'min_exp',
       'max_exp', 'Location_encoded', 'Salary_encoded'],
      dtype='object')

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

In [11]:
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc

print("Accuracy scores:\n", results)

Accuracy scores:
 {'Decision Tree': 0.35521332996718, 'Random Forest': 0.3557182529664226, 'Extra Trees': 0.36076748295884875, 'AdaBoost': 0.2638222671042666, 'Gradient Boosting': 0.3337541024993688}


In [12]:
best_model_name = max(results, key=results.get)
print("Best Model:", best_model_name)

Best Model: Extra Trees
