# Train Salary Prediction Model for API

This notebook trains a Linear Regression model with the exact features your API expects and exports the model files.

In [None]:
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\VICTOR HUGO\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\VICTOR HUGO\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\VI

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

: 

In [None]:
# Load the dataset.
# The CSV location can be overridden without editing the notebook by setting the
# SALARY_DATA_PATH environment variable; when it is unset, the original default
# path below is used.
data_path = os.environ.get('SALARY_DATA_PATH', r'd:\Downloads\salary_data_cleaned.csv')
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
df.head()

## Prepare Features for API

The API expects these features:
- `rating` (float): Company rating (0-5)
- `age` (int): Company age in years
- `same_state` (int): 0 or 1
- `python_yn` (int): 0 or 1
- `R_yn` (int): 0 or 1
- `spark` (int): 0 or 1
- `aws` (int): 0 or 1
- `excel` (int): 0 or 1
- `job_simp` (str): Simplified job title (will be one-hot encoded)
- `seniority` (str): Seniority level (will be one-hot encoded)
- `desc_len` (int): Job description length
- `num_comp` (int): Number of competitors

In [None]:
# Create job_simp from Job Title
def simplify_job_title(title):
    """Map a raw job title onto one of the coarse ``job_simp`` categories.

    Matching is case-insensitive and the first rule that matches wins, in this
    order: data scientist, data engineer, analyst, machine learning, manager,
    director.

    Args:
        title: Raw job title. Non-string values are converted with ``str()``.

    Returns:
        str: The matched category, or ``'data scientist'`` when no rule matches.
    """
    title_lower = str(title).lower()
    if 'data scientist' in title_lower:
        return 'data scientist'
    elif 'data engineer' in title_lower:
        return 'data engineer'
    elif 'analyst' in title_lower:
        return 'analyst'
    # The abbreviation must be a standalone token: a bare substring test for
    # "ml" also fires on words such as "html" or "xml".
    elif 'machine learning' in title_lower or re.search(r'\b(?:ml|mle|mlops)\b', title_lower):
        return 'machine learning'
    elif 'manager' in title_lower:
        return 'manager'
    elif 'director' in title_lower:
        return 'director'
    else:
        return 'data scientist'

# Derive the simplified job category for every posting, element by element
df['job_simp'] = df['Job Title'].map(simplify_job_title)

# Create seniority from Job Title
def extract_seniority(title):
    """Derive a seniority label from a raw job title.

    Matching is case-insensitive and the first rule that matches wins, in this
    order: senior, junior, lead, principal.

    Args:
        title: Raw job title. Non-string values are converted with ``str()``.

    Returns:
        str: ``'senior'``, ``'junior'``, ``'lead'``, ``'principal'``, or
        ``'na'`` when no rule matches.
    """
    title_lower = str(title).lower()
    # The abbreviations "sr" / "jr" must be standalone tokens: as bare
    # substrings they also occur inside ordinary words (e.g. "classroom").
    if 'senior' in title_lower or re.search(r'\bsr\b', title_lower):
        return 'senior'
    elif 'junior' in title_lower or re.search(r'\bjr\b', title_lower):
        return 'junior'
    elif 'lead' in title_lower:
        return 'lead'
    elif 'principal' in title_lower:
        return 'principal'
    else:
        return 'na'

# Seniority label for every posting, derived from its title
df['seniority'] = df['Job Title'].map(extract_seniority)

# Character count of each job description (values are stringified first)
df['desc_len'] = df['Job Description'].map(lambda text: len(str(text)))

# Create num_comp from Competitors
def count_competitors(comp):
    """Count the competitors listed in a comma-separated ``Competitors`` value.

    Args:
        comp: Raw cell value. Missing values and the ``-1`` placeholder (as a
            string or as a number) mean that no competitors are listed.

    Returns:
        int: Number of non-empty comma-separated names; 0 when none are listed.
    """
    if pd.isna(comp) or comp == -1:
        return 0
    text = str(comp).strip()
    if text == '-1':
        return 0
    # Skip blank fragments so '', 'A,' or 'A,,B' are not over-counted.
    return sum(1 for name in text.split(',') if name.strip())

# Number of competitors listed for each posting
df['num_comp'] = df['Competitors'].map(count_competitors)

# Quick sanity check: list the categories produced for the two derived text features
print("Created derived features:")
print(f"job_simp unique: {df['job_simp'].unique()}")
print(f"seniority unique: {df['seniority'].unique()}")

In [None]:
# Select features that match the API.
# The dataset names the company rating column 'Rating' while the API field is
# lowercase 'rating', so rename first and then select by the API-facing names.
df = df.rename(columns={'Rating': 'rating'})
feature_cols = ['rating', 'age', 'same_state', 'python_yn', 'R_yn', 'spark', 'aws', 'excel', 'job_simp', 'seniority', 'desc_len', 'num_comp']

# Prepare X and y (copy so later changes to X do not touch df)
X = df[feature_cols].copy()
y = df['avg_salary']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
X.head()

In [None]:
# Expand the two categorical features into indicator columns. With
# drop_first=True the first level of each feature gets no column of its own.
categorical_cols = ['job_simp', 'seniority']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print(f"Encoded features shape: {X_encoded.shape}")
print(f"Feature columns: {list(X_encoded.columns)}")
X_encoded.head()

## Train the Model

In [None]:
# Split the data: 20% is held out for evaluation, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# The label uses the superscript-two character (U+00B2) so it renders as "R²"
print(f"R² Score: {r2:.4f}")
print(f"RMSE: ${rmse:.2f}K")

## Export Model Files for Railway API

We need to export:
1. `linear_regression.joblib` - The trained model
2. `feature_columns.joblib` - The list of feature column names (in correct order)

In [None]:
# Make sure the output folder exists (does nothing if it is already there)
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Destination files for the two artifacts
model_path = os.path.join(models_dir, 'linear_regression.joblib')
columns_path = os.path.join(models_dir, 'feature_columns.joblib')

# Column names of the encoded training frame, in order
feature_columns = X_encoded.columns.tolist()

# Persist the fitted estimator
joblib.dump(lr, model_path)
print(f"Model saved to: {model_path}")

# Persist the column list next to it
joblib.dump(feature_columns, columns_path)
print(f"Feature columns saved to: {columns_path}")

print(f"\nFeature columns ({len(feature_columns)}):")
for column_name in feature_columns:
    print(f"  - {column_name}")

## Test the Model

Let's test with sample input matching the API format:

In [None]:
# Test with sample API input
sample_input = {
    "rating": 3.5,
    "age": 10,
    "same_state": 1,
    "python_yn": 1,
    "R_yn": 0,
    "spark": 1,
    "aws": 1,
    "excel": 0,
    "job_simp": "data scientist",
    "seniority": "senior",
    "desc_len": 500,
    "num_comp": 3,
}

# Convert to DataFrame
test_df = pd.DataFrame([sample_input])

# One-hot encode WITHOUT drop_first. A single-row frame has exactly one level
# per categorical column, so drop_first=True would drop that only level and the
# row would lose its job_simp / seniority information entirely.
test_encoded = pd.get_dummies(test_df, columns=['job_simp', 'seniority'])

# Align with training columns: dummy columns the row does not have are added
# as 0, columns the model was not trained on are discarded, and the order is
# set to match feature_columns.
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

# Predict
prediction = lr.predict(test_encoded)
print(f"Sample prediction: ${prediction[0]:.2f}K")

## Next Steps

1. Run all cells in this notebook
2. Copy the files from the `models/` folder:
   - `linear_regression.joblib`
   - `feature_columns.joblib`
3. Upload them to your Railway repo's `models/` folder
4. Push to GitHub and Railway will auto-redeploy
5. Test the API!