# **Support Vector Regression**

# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np


# **Loading Dataset**

In [None]:
# Load the salary dataset from its CSV file into a DataFrame.
csv_path = "/content/Salary_Data_Based_country_and_race.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

# **Data Cleaning**

In [None]:

# 2.3 Cleaning: trim whitespace, drop duplicates, drop rows without a target.
# Whitespace is stripped *before* de-duplication so that rows which differ only
# by stray leading/trailing spaces are recognised as duplicates and removed.
df = (
    df.apply(lambda s: s.str.strip() if s.dtype == "object" else s)
      .drop_duplicates()
      .dropna(subset=["Salary"])   # Salary is the regression target
      .reset_index(drop=True)      # contiguous 0..n-1 index after row removal
)

In [None]:
# Preview the data again now that the cleaning step has run.
df.head()

In [None]:
# 3.1 Give the two long column headers short, consistent names
short_names = {"Education Level": "Education", "Years of Experience": "Experience"}
df = df.rename(columns=short_names)

# Show the resulting column index
df.columns


# **EDA and Visualization**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 4.1 Distribution of Salary (histogram with a KDE overlay)
salary_ax = sns.histplot(data=df, x="Salary", kde=True)
salary_ax.set_title("Salary Distribution")
plt.show()

# 4.2 Salary vs. Experience
# sns.scatterplot(x="Experience", y="Salary", data=df)
# plt.title("Experience vs Salary")
# plt.show()


In [None]:
# 4.2 Salary vs. Experience
experience_ax = sns.scatterplot(data=df, x="Experience", y="Salary")
experience_ax.set_title("Experience vs Salary")
plt.show()

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# ohe = OneHotEncoder(drop="first", sparse_output=False)
# X_cat = pd.DataFrame(
#     ohe.fit_transform(df[cat_cols]),
#     columns=ohe.get_feature_names_out(cat_cols),
#     index=df.index
# )


# **Feature Engineering**


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Build the model feature matrix X from the cleaned DataFrame `df`.

# 1. Numeric features, with missing values replaced by the column median.
#    Imputation is done *before* the frames are combined so that X is
#    assembled once, rather than being patched afterwards with an in-place
#    DataFrame.update (which only copies non-NA values and may have to
#    change column dtypes on the fly).
X_num = df[["Age", "Experience"]]
imp = SimpleImputer(strategy="median")
X_num_imputed = pd.DataFrame(
    imp.fit_transform(X_num),
    columns=X_num.columns,
    index=df.index,
)

# 2. One-hot encode categorical features (the first level of each column is
#    dropped, so k categories become k-1 indicator columns).
cat_cols = ["Gender", "Education", "Job Title", "Country", "Race"]
ohe = OneHotEncoder(drop="first", sparse_output=False)
X_cat = pd.DataFrame(
    ohe.fit_transform(df[cat_cols]),
    columns=ohe.get_feature_names_out(cat_cols),
    index=df.index,
)

# 3. Combine into a single feature matrix (rows aligned on df.index)
X = pd.concat([X_num_imputed, X_cat], axis=1)

# Sanity check: the numeric columns must be fully populated after imputation.
assert not X[X_num.columns].isna().any().any(), "numeric features still contain NaN"

# **Normalisation and Standardisation**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and keep the result as a labelled DataFrame.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep X's row labels so X_scaled stays aligned with y
)

# Regression target
y = df["Salary"]


# **Splitting Data into Train and Test**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
split_parts = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
!pip install optuna
import optuna
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Optuna objective function
def objective(trial):
    """Return the cross-validated MSE of an RBF-kernel SVR for one Optuna trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``C`` and ``epsilon`` (both on a log
        scale) and ``gamma`` (categorical: ``'scale'`` or ``'auto'``).

    Returns
    -------
    float
        Mean over the CV folds of the mean squared error measured on
        ``X_train`` / ``y_train``. Lower is better, so the study should be
        created with ``direction='minimize'``.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform; it samples from the same distribution.
    C = trial.suggest_float('C', 1e-2, 1e3, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-2, 10, log=True)
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    svr = SVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)
    model = make_pipeline(StandardScaler(), svr)

    # The scorer reports *negative* MSE (higher is better); flip the sign so
    # the returned value is a plain MSE.
    score = cross_val_score(model, X_train, y_train, cv=2, scoring='neg_mean_squared_error')
    return -score.mean()

# Run optimization
# The sampler is seeded so that repeated runs propose the same sequence of
# trials and therefore report the same best parameters.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best model training
best_params = study.best_trial.params
print("Best params from Optuna:", best_params)

# The kernel is spelled out so the refit model is exactly the kind of model
# that objective() scored, independent of SVR's default kernel.
best_model = make_pipeline(StandardScaler(), SVR(kernel='rbf', **best_params))
best_model.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, y_pred))


# **Evaluation**

In [None]:
# from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV

# # 9.1 Hyperparameter grid
# param_grid = {
#     "C": [1, 10, 100],
#     "epsilon": [0.1, 1, 5],
#     "gamma": ["scale", "auto"]
# }

# # 9.2 Grid search on SVR
# svr = SVR(kernel="rbf")
# grid = GridSearchCV(svr, param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
# grid.fit(X_train, y_train)

# print("Best params:", grid.best_params_)
# model = grid.best_estimator_




# ####################
# param_grid = {
#     "C": [0.1, 1, 10, 100, 1000],
#     "epsilon": [0.01, 0.1, 0.5, 1, 5, 10],
#     "gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1]
# }

# # 9.2 Grid search on SVR
# svr = SVR(kernel="rbf")
# grid = GridSearchCV(svr, param_grid, cv=2, scoring="neg_mean_squared_error", n_jobs=-1, verbose=2)
# grid.fit(X_train, y_train)

# print("Best params:", grid.best_params_)
# model = grid.best_estimator_
# #########################

# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform

# param_dist = {
#     "C": uniform(0.1, 1000),
#     "epsilon": uniform(0.01, 10),
#     "gamma": ["scale", "auto"]
# }

# svr = SVR(kernel="rbf")
# rand_search = RandomizedSearchCV(svr, param_distributions=param_dist, n_iter=50,cv=5, scoring="neg_mean_squared_error", n_jobs=-1, random_state=42)
# rand_search.fit(X_train, y_train)
# print("Best params:", rand_search.best_params_)


In [None]:
# from sklearn.metrics import mean_squared_error, r2_score

# y_pred = model.predict(X_test)
# print("MSE:", mean_squared_error(y_test, y_pred))
# print("R²:", r2_score(y_test, y_pred))

# # 10.1 Plot predictions vs. actual
# plt.scatter(y_test, y_pred, alpha=0.6)
# plt.plot([y.min(), y.max()], [y.min(), y.max()], "r--")
# plt.xlabel("Actual Salary")
# plt.ylabel("Predicted Salary")
# plt.title("SVR: Actual vs Predicted")
# plt.show()


# **Result Visualization**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import numpy as np
import matplotlib.pyplot as plt

# Illustrative single-feature model: Salary as a function of Experience only.
# Rows with a missing value are dropped here because `df` itself is never
# imputed and SVR cannot be fitted on NaN inputs.
vis_df = df[["Experience", "Salary"]].dropna()
X_vis = vis_df[["Experience"]].values
y_vis = vis_df["Salary"].values.reshape(-1, 1)

# Feature scaling (separate scalers so each axis can be inverted for plotting)
sc_X = StandardScaler()
sc_y = StandardScaler()

# Dedicated names: the global X_scaled holds the full feature matrix used by
# the train/test split and must not be overwritten by this one-column array.
X_vis_scaled = sc_X.fit_transform(X_vis)
y_vis_scaled = sc_y.fit_transform(y_vis).ravel()

# SVR model
regressor = SVR(kernel='rbf')
regressor.fit(X_vis_scaled, y_vis_scaled)

# High-res grid for smooth curve; .min()/.max() return scalars, which is what
# np.arange expects (the built-in min/max would return 1-element arrays).
X_grid = np.arange(X_vis_scaled.min(), X_vis_scaled.max(), 0.01).reshape(-1, 1)
y_grid = sc_y.inverse_transform(regressor.predict(X_grid).reshape(-1, 1))

# Plot observations (red) and the fitted curve (blue) in original units
plt.figure(figsize=(10, 6))
plt.scatter(X_vis, y_vis, color='red')
plt.plot(sc_X.inverse_transform(X_grid), y_grid, color='blue')
plt.title('SVR Model - Salary vs. Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.grid(True)
plt.show()
