# **Employee Salary Prediction using Random Forest Technique**

Step 1: Loading and Inspecting the Dataset

In [None]:
import pandas as pd
import numpy as np

# Load the Adult (Census Income) dataset hosted by the UCI Machine Learning Repository.
# The raw file has no header row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'salary'
]

# `skipinitialspace=True` drops the blank that follows each comma, so every field
# is read without leading whitespace. The missing-value marker must therefore be
# given as '?' (not ' ?'); otherwise the '?' entries stay as ordinary strings and
# are never reported or filled as missing values.
df = pd.read_csv(
    url,
    header=None,
    names=columns,
    na_values='?',
    skipinitialspace=True,
)

# Print the first 5 rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

In [None]:
# Print descriptive statistics for the dataset under a heading
print("\nSummary Statistics:", df.describe(), sep="\n")

Step 2: Data Cleaning and Handling Missing Values

In [None]:
# Report how many missing (NaN) entries each column contains
print("Missing values in each column:", df.isna().sum(), sep="\n")

In [None]:
# Fill missing values in these categorical columns with the column's mode
# (its most frequent value).
# The filled Series is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `df[col]`: the in-place form acts on an
# intermediate object, triggers a FutureWarning on recent pandas versions, and
# leaves `df` unchanged when Copy-on-Write is enabled.
for col in ['workclass', 'occupation', 'native-country']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-count missing entries per column to confirm the cleaning step worked
print("\nMissing values after cleaning:", df.isna().sum(), sep="\n")

Step 3: Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns

# Select seaborn's "whitegrid" style for the plots in this notebook
sns.set_style("whitegrid")

In [None]:
import matplotlib.pyplot as plt

# 1. Salary Distribution (Target Variable)
# Bar chart of how many rows fall into each salary class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='salary', ax=ax)
ax.set_title('Distribution of Salary Classes')
ax.set_xlabel('Salary')
ax.set_ylabel('Count')
plt.show()

In [None]:
# 2. Age Distribution
# Histogram of 'age' in 30 bins with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 3. Salary vs. Hours per Week
# Box plot of weekly working hours, one box per salary class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='salary', y='hours-per-week', ax=ax)
ax.set_title('Salary vs. Hours per Week')
ax.set_xlabel('Salary')
ax.set_ylabel('Hours per Week')
plt.show()

In [None]:
# Encode the target variable 'salary' as binary: 1 where the value is '>50K',
# 0 for every other value.
is_high_salary = df['salary'] == '>50K'
df['salary'] = is_high_salary.astype('int64')

Step 4: Preprocessing the data

In [None]:
# One-hot encode the categorical features.
# Every object-dtype column is expanded into 0/1 indicator columns, one per
# category; `drop_first=True` omits the first category of each column.
object_columns_frame = df.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns
df_processed = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Preview the first rows of the one-hot encoded frame
print("Processed data with one-hot encoding:", df_processed.head(), sep="\n")

In [None]:
# Separate the target ('salary') from the features (every remaining column).
y = df_processed['salary']
X = df_processed.drop(columns=['salary'])

Step 5: Data Splitting

In [None]:
#Data Splitting and Model Training

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor # We will use Regressor for demonstration, but Classifier is more appropriate for binary outcome.
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

# Hold out 20% of the rows as a test set and train on the remaining 80%.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Set up the Random Forest model.
# A Regressor predicts a continuous value; because the target here is 0 or 1 a
# Classifier would be the more standard choice, but the Regressor still works
# and serves as a learning example.
# n_estimators is the number of trees in the forest.
forest_settings = dict(n_estimators=100, random_state=42, n_jobs=-1)
model = RandomForestRegressor(**forest_settings)


Step 6: Model Training

In [None]:
# Fit the model on the training split
print("Training the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete.")

# 5-fold cross-validation scored with R2; note it is run on the full X and y,
# not only on the training split.
print("\nPerforming Cross-Validation...")
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5)
mean_cv_r2 = cv_scores.mean()
print(f"Cross-Validation R2 Scores: {cv_scores}")
print(f"Average CV R2 Score: {mean_cv_r2:.4f}")

Step 7: Model Evaluation

In [None]:


# Predict on the held-out test data
y_pred = model.predict(X_test)

# The regressor returns continuous values; round them to obtain 0/1 labels
y_pred_classified = y_pred.round()

# Regression metrics use the raw predictions, accuracy uses the rounded labels
accuracy = accuracy_score(y_test, y_pred_classified)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(
    "\nModel Performance on Test Set:",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R-squared (R2 Score): {r2:.4f}",
    f"Accuracy Score: {accuracy:.4f}",
    sep="\n",
)

In [None]:
# Print the classification report for the rounded test-set predictions
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_classified),
    sep="\n",
)

# With the model now trained and evaluated, the next step is to save it for integration into a web application (Streamlit).

In [None]:
import joblib

# Persist the fitted model and the feature column names, each to its own file
artifacts = {
    'random_forest_model.pkl': model,
    'model_columns.pkl': X.columns,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("\nModel and columns saved successfully.")

# GUI of the Web App - Streamlit

The file path for **app.py** is: /content/app.py

**Steps to launch the web app in Google Colab:**

Execute the following commands one at a time.

1. Copy the IP address printed by Step 2, then run Step 3; press `y` if prompted, and then click the link that is provided.

2. Enter the IP address as the Tunnel password on the website that opens.

In [None]:
# Step 1: Install Streamlit using pip (`-q` keeps the installer output quiet).
! pip install streamlit -q

In [None]:
# Step 2: Print the current public IPv4 address by fetching ipv4.icanhazip.com
# (`-q` silences wget's progress output, `-O -` writes the response to stdout).
# This address is the value to enter as the Tunnel password.
!wget -q -O - ipv4.icanhazip.com

In [None]:
# Step 3: Start the Streamlit app (app.py) in the background (`&`), then run
# LocalTunnel through npx to expose local port 8501 to the internet.
# NOTE(review): 8501 is presumably the port Streamlit serves on — confirm if the
# app is configured to use a different port.
! streamlit run app.py & npx localtunnel --port 8501