In [5]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import re

# Load Kaggle Survey
# Survey question id -> column name used by the rest of this script.
# NOTE(review): the Q12_* -> language pairing is carried over unchanged;
# confirm against the survey's question text that Q12_5 and Q12_10 really
# are the SQL and Go options.
survey_columns = {
    'Q2': 'Age',
    'Q4': 'Country',
    'Q8': 'Education',
    'Q11': 'YearsCoding',
    'Q12_1': 'Python',
    'Q12_5': 'SQL',
    'Q12_7': 'Java',
    'Q12_10': 'Go',
    'Q29': 'Salary',
}

df = (
    pd.read_csv(
        '/Users/medhavadlamaani/Downloads/kaggle_survey_2022_responses (1).csv',
        low_memory=False,
    )
    # Discard the row labelled 0 (presumably the question-text row that sits
    # under the header in the export - TODO confirm) and renumber from 0.
    .drop(0)
    .reset_index(drop=True)
)

# Keep only the questions listed above, then give them readable names.
df = df[list(survey_columns)].rename(columns=survey_columns)

# Cleaning Salary 
def convert_salary(value):
    """Convert one raw salary answer into a single numeric estimate.

    Args:
        value: Raw cell value, e.g. ``"$0-999"``, ``"1,000-1,999"``,
            ``">$1,000,000"``, a plain number, or a missing value.

    Returns:
        float: ``base * 1.25`` for an open-ended ``">base"`` answer, the
        midpoint for a ``"low-high"`` range, the number itself for a plain
        numeric string, and ``np.nan`` for missing or unparseable input.
    """
    if pd.isnull(value):
        return np.nan

    # Strip currency formatting so only digits and the range markers remain.
    value = str(value).replace('$', '').replace(',', '').strip()

    if value.startswith('>'):
        # Open-ended top bucket: there is no upper bound to average with, so
        # the lower bound is scaled up by 25% as a stand-in estimate.
        num = re.findall(r'\d+', value)
        if not num:
            return np.nan
        return int(num[0]) * 1.25

    if '-' in value:
        parts = value.split('-')
        try:
            return (int(parts[0]) + int(parts[1])) / 2
        except ValueError:
            # Non-integer bound (e.g. empty string from a leading '-').
            # Only ValueError is caught so that unrelated failures and
            # KeyboardInterrupt are no longer silently swallowed.
            return np.nan

    try:
        return float(value)
    except ValueError:
        return np.nan

# Turn every salary answer into a number, then keep only rows whose salary
# is both present and non-zero.
df['Salary'] = df['Salary'].apply(convert_salary)
df = df[df['Salary'].notna() & df['Salary'].ne(0)]

# Mapping Education to numbers
# Each inner tuple holds the answers that share one ordinal level; the level
# is the tuple's position in the outer sequence (0, 1, 2, 3).
education_mapping = {
    answer: level
    for level, answers in enumerate((
        ('No formal education past high school',),
        ('Some college/university study without earning a bachelor’s degree',
         'Bachelor’s degree'),
        ('Master’s degree',),
        ('Doctoral degree',
         'Professional doctorate'),
    ))
    for answer in answers
}
# Answers not listed above (and missing values) fall back to level 1.
df['Education'] = (
    df['Education']
    .map(education_mapping)
    .fillna(1)
)

# Simplify Country
def simplify_country(x):
    """Collapse a survey country answer into one of four buckets.

    Returns 'United States', 'United Kingdom' or 'Germany' for the matching
    survey answers, and 'Other' for everything else (including missing
    values).
    """
    known_countries = (
        ('United States of America', 'United States'),
        ('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom'),
        ('Germany', 'Germany'),
    )
    for survey_name, short_name in known_countries:
        if x == survey_name:
            return short_name
    return 'Other'

# Reduce Country to the four buckets produced by simplify_country.
df['Country'] = df['Country'].apply(simplify_country)

# Clean skills
# Each skill column becomes a 0/1 flag: 1 when the cell holds any value,
# 0 when it is missing.
for skill in ('Python', 'SQL', 'Java', 'Go'):
    df[skill] = df[skill].notnull().astype(int)

# Clean YearsCoding
# Each experience bucket is mapped to a representative number of years
# (the midpoint for closed ranges).
coding_mapping = {
    'I have never written code': 0,
    '< 1 years': 0.5,
    '1-2 years': 1.5,
    # NOTE(review): believed to be the label the 2022 survey uses for this
    # bucket - confirm against the data. Without this key such answers would
    # miss the mapping and silently receive the fallback value below.
    '1-3 years': 2,
    '3-5 years': 4,
    '5-10 years': 7.5,
    '10-20 years': 15,
    '20+ years': 25
}
# Unrecognised or missing answers fall back to 4 years.
df['YearsCoding'] = df['YearsCoding'].map(coding_mapping).fillna(4)

# One-hot encode Country.
# drop_first is deliberately NOT used: get_dummies drops the alphabetically
# first level, which here is 'Germany'. The loop below would then re-create
# Country_Germany as an all-zero column, leaving German respondents
# indistinguishable from 'Other'. Encoding every level and simply leaving
# Country_Other out of feature_cols makes 'Other' the baseline instead.
# dtype=int keeps the indicator columns consistent with the integer 0 used
# for the fallback columns below.
df = pd.get_dummies(df, columns=['Country'], dtype=int)

# Ensure all needed country columns exist
# (a bucket with no respondents produces no indicator column).
for col in ['Country_Germany', 'Country_United Kingdom', 'Country_United States']:
    if col not in df.columns:
        df[col] = 0

# Final feature set
# Column order matters: anything that later feeds rows to the saved model
# must supply the features in this same order.
feature_cols = [
    'Education', 'YearsCoding', 'Java', 'Python', 'SQL', 'Go',
    'Country_Germany', 'Country_United Kingdom', 'Country_United States'
]

X = df[feature_cols]
y = df['Salary']

# Train Random Forest Model
# NOTE: the model is fitted on every row; no held-out split is made here,
# so this cell produces no estimate of predictive accuracy.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# .pkl file
with open('salary2025_model.pkl', 'wb') as f:
    pickle.dump(rf, f)

print(" Random Forest model trained and saved as salary2025_model.pkl")


 Random Forest model trained and saved as salary2025_model.pkl
