In [ ]:
# Numeric helpers (np) and tabular data handling (pd) used by the cells below
import numpy as np
import pandas as pd

In [ ]:
# Step 1: Load Data
# Read the CSV (resolved relative to the current working directory) into a
# DataFrame, then preview its first five rows as the cell output.
dataset = pd.read_csv(filepath_or_buffer='Dataset.csv')
dataset.head(n=5)

In [ ]:
# Step 1.1: Data Preprocessing - Handle missing values and outliers
print('Missing values in each column:')
print(dataset.isnull().sum())

# Fill missing values with the per-column mean (if any).
# numeric_only=True keeps mean() from raising TypeError when the CSV contains
# non-numeric columns; such columns are simply left unfilled.
dataset = dataset.fillna(dataset.mean(numeric_only=True))

# Outlier detection and removal (IQR method) on the second column, which this
# notebook treats as Salary: keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
salary = dataset.iloc[:, 1]
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Named `inlier_mask` rather than `filter` so the builtin filter() stays usable.
inlier_mask = (salary >= lower_bound) & (salary <= upper_bound)
dataset = dataset.loc[inlier_mask]

# X takes every column except the last; y takes every column from index 1 on.
# NOTE(review): these two slices only give a clean feature/target split when
# Dataset.csv has exactly two columns - TODO confirm against the data file.
# Both are kept 2-D (column slices, not single-column picks).
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1:].values

In [ ]:
# Step 2: Split data into training and testing
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

In [ ]:
# Print the DataFrame currently bound to `dataset` for visual inspection
print(dataset)

In [ ]:
# Step 3: Fit Simple Linear Regression to Training Data
from sklearn.linear_model import LinearRegression

# Build the estimator with its default settings and train it on the
# training split only; the fitted estimator is the cell's output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [ ]:
# Step 4: Make Prediction
# Predict targets for the held-out rows, then show the estimator's score on
# the same test split as the cell output.
y_pred = regressor.predict(X=X_test)
regressor.score(X=X_test, y=y_test)

In [ ]:
# Step 4.1: Model Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics comparing the test targets with the model's predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)

# One metric per output line, each rounded to two decimals.
print(
    f'MAE: {mae:.2f}',
    f'MSE: {mse:.2f}',
    f'RMSE: {rmse:.2f}',
    f'R²: {r2:.2f}',
    sep='\n',
)

In [ ]:
import joblib

In [ ]:
# save the model
# Serialize the fitted regressor to a file in the working directory so it
# can be reloaded later without refitting.
filename = 'final_model.sav'
joblib.dump(value=regressor, filename=filename)

In [ ]:
# Step 5 - Visualize training set results
import matplotlib.pyplot as plt

# Title and axis labels (independent of the drawing calls below)
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
# Actual training observations as red points
plt.scatter(X_train, y_train, color='red')
# Model output for the same training inputs, drawn as the blue regression line
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.show()

In [ ]:
# Step 6 - Visualize test set results
import matplotlib.pyplot as plt

# Title and axis labels (independent of the drawing calls below)
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
# Actual test observations as red points
plt.scatter(X_test, y_test, color='red')
# The line is still drawn from the training inputs: it is the same fitted
# line as in the training plot, now shown against the unseen test points.
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.show()

In [ ]:
# Step 7 - Make new prediction
# The nested list is a single sample with a single feature value (15).
new_salary_pred = regressor.predict(X=[[15]])
# The prediction is printed as returned by predict(), i.e. as an array.
print(
    'The predicted salary of a person with 15 years experience is ',
    new_salary_pred,
)

In [ ]:
# Step 7.1: Load the saved model and predict for user input
import math

import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files you created yourself or otherwise trust.
loaded_model = joblib.load('final_model.sav')

# Only the parsing/validation of the typed value is guarded, so genuine model
# or prediction failures surface as real errors instead of "Invalid input".
try:
    years_exp = float(input('Enter years of experience: '))
    # float() accepts 'nan' and 'inf'; reject them here before predicting.
    if not math.isfinite(years_exp):
        raise ValueError('years of experience must be a finite number')
except ValueError as e:
    print('Invalid input:', e)
else:
    user_pred = loaded_model.predict([[years_exp]])
    # ravel() flattens the result so this works whether predict() returns a
    # 1-D or a 2-D array; there is exactly one prediction either way.
    print(f'Predicted salary for {years_exp} years of experience: {user_pred.ravel()[0]:.2f}')

In [ ]:
# Step 8: Residual Plot for Diagnostics
import matplotlib.pyplot as plt

# Residual = actual test target minus the model's prediction for it
residuals = y_test - y_pred

# Title and axis labels (independent of the drawing calls below)
plt.title('Residual Plot')
plt.xlabel('Predicted Salary')
plt.ylabel('Residuals')
# Residuals plotted against the predicted values
plt.scatter(y_pred, residuals, color='purple')
# Dashed reference line at zero residual
plt.axhline(y=0, color='black', linestyle='--')
plt.show()