In [None]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, LeakyReLU, Dropout
from tensorflow.keras.models import Sequential

In [1]:
# Loading the dataset and discarding every row that lacks one of the required fields
required_columns = ['DOJ', 'AGE', 'LEAVES REMAINING', 'LEAVES USED']
df = (
    pd.read_csv("Data Salary Prediction.csv")
    .dropna(subset=required_columns)  # a row is removed if ANY of these is missing
    .reset_index(drop=True)           # renumber rows 0..n-1 after the filtering
)

In [None]:
# Previewing the first rows of the dataset.
# A bare expression is only rendered when it is the last statement of a cell,
# so the preview is shown explicitly; a handful of rows keeps the output readable.
display(df.head(10))

# Checking the number of missing values in the 'UNIT' column
df['UNIT'].isna().sum()

In [None]:
# Histogram with a KDE overlay showing how 'SALARY' is distributed
salary_hist_ax = sns.histplot(data=df, x='SALARY', kde=True)
salary_hist_ax.set_title('Distribution of SALARY')
plt.show()

In [None]:
# Boxplot of 'SALARY' to expose its spread and potential outliers
salary_box_ax = sns.boxplot(data=df, x='SALARY')
salary_box_ax.set_title('Boxplot of SALARY')
plt.show()

In [None]:
# One-hot encoding 'DESIGNATION' and exploring correlations between variables.
# The indicator columns are appended instead of substituted so that the original
# 'DESIGNATION' column stays available to the later cell that groups by it.
# dtype=int keeps the indicators numeric (0/1) rather than boolean.
designation_dummies = pd.get_dummies(df['DESIGNATION'], prefix='Desig', dtype=int)

# Dropping any earlier copy of the indicator columns first makes the cell safe to re-run.
df = pd.concat(
    [df.drop(columns=designation_dummies.columns, errors='ignore'), designation_dummies],
    axis=1,
)

# Correlation is only defined for numeric data; text/date columns are excluded
# explicitly because recent pandas versions raise on them instead of skipping them.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Visualizing the correlation matrix using a heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Scatter panels of 'SALARY' against 'SEX' and against 'PAST EXP'
salary_pair_grid = sns.pairplot(
    data=df,
    kind='scatter',
    x_vars=['SEX', 'PAST EXP'],
    y_vars=['SALARY'],
)
plt.show()


In [None]:
# Visualizing average salary by designation using a bar plot.
# By the time this cell runs, 'DESIGNATION' may already have been replaced by
# one-hot 'Desig_*' columns; in that case the label is rebuilt from them so the
# cell works under Restart & Run All.
if 'DESIGNATION' in df.columns:
    designation = df['DESIGNATION']
else:
    desig_columns = [col for col in df.columns if col.startswith('Desig_')]
    indicators = df[desig_columns].astype(int)
    designation = (
        indicators.idxmax(axis=1)          # name of the column holding the 1
        .str[len('Desig_'):]               # strip the prefix to recover the label
        .where(indicators.any(axis=1))     # rows with no indicator set get no label
    )

average_salary_by_designation = (
    df['SALARY']
    .groupby(designation.rename('DESIGNATION'))
    .mean()
    .reset_index()
    .sort_values(by='SALARY', ascending=False)
)

plt.figure(figsize=(12, 6))
sns.barplot(x='SALARY', y='DESIGNATION', data=average_salary_by_designation, palette='viridis')
plt.title('Average Salary by Designation')
plt.xlabel('Average Salary')
plt.ylabel('Designation')
plt.show()

In [None]:
# Deriving tenure (whole days between 'DOJ' and 'CURRENT DATE') and visualizing correlation.
# The parsed dates are kept in local variables instead of temporary DataFrame
# columns, so nothing has to be dropped afterwards and the cell can be re-run.
current_dates = pd.to_datetime(df['CURRENT DATE'])
joining_dates = pd.to_datetime(df['DOJ'])
df['TENURE'] = (current_dates - joining_dates).dt.days

# Only numeric columns can be correlated; recent pandas versions raise on
# text columns instead of silently skipping them.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Encoding categorical variable 'SEX'.
# The encoded column is kept: previously it was dropped on the very next line,
# which turned the encoding into a no-op. It is not part of `selected_features`,
# so the model inputs are unchanged.
label_encoder = LabelEncoder()
df['SEX_ENCODED'] = label_encoder.fit_transform(df['SEX'])

# Splitting the dataset into features and target variables
selected_features = ['TENURE', 'AGE', 'Desig_Analyst', 'PAST EXP', 'Desig_Associate', 'Desig_Director', 'Desig_Manager', 'Desig_Senior Analyst', 'Desig_Senior Manager']
# A single float dtype guarantees a plain numeric array for the network,
# even when the one-hot columns are stored as booleans.
X = df[selected_features].astype('float32')
Y = df['SALARY']
# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=30)


In [None]:
# Building and training a neural network model.
# Seeding TensorFlow makes weight initialisation and dropout repeatable between runs.
tf.random.set_seed(30)

# Five fully connected hidden layers (512 -> 32 units), each with LeakyReLU
# activation, L2 weight regularisation and 50% dropout, followed by a single
# linear output unit for the regression target.
model = Sequential([
    Dense(512, activation=LeakyReLU(alpha=0.01), kernel_regularizer=regularizers.l2(0.01), input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(256, activation=LeakyReLU(alpha=0.01), kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(128, activation=LeakyReLU(alpha=0.01), kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(64, activation=LeakyReLU(alpha=0.01), kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(32, activation=LeakyReLU(alpha=0.01), kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1)
])
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=custom_optimizer, loss='mean_squared_error', metrics=['mae'])

# Training the model. Without this step the evaluation cell would score a
# network that still has its random initial weights.
# NOTE(review): epochs, batch size and patience are starting values, not tuned settings.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,  # keep the weights from the best validation epoch
)
history = model.fit(
    X_train.astype('float32'),  # uniform numeric dtype for tensor conversion
    Y_train,
    validation_split=0.2,       # hold out part of the training data to drive early stopping
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=0,
)



In [None]:
# Evaluating the model performance on the test set.
# predict() returns an array of shape (n_samples, 1); it is flattened so that it
# lines up one-to-one with Y_test.
y_pred = model.predict(X_test.astype('float32')).ravel()
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
# RMSE is derived from MSE directly: the `squared=False` argument is deprecated
# (and removed in recent scikit-learn releases), so it is not used here.
rmse = mse ** 0.5
r2 = r2_score(Y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')