# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the salaries dataset from the Colab working directory.
df = pd.read_csv('/content/Salaries.csv')
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print it explicitly to make the preview visible here.
print(df.head())

# Get the number of rows and columns.
num_rows, num_cols = df.shape
print(f'Number of rows: {num_rows}')
print(f'Number of colomns: {num_cols}')

# Get the data type pandas inferred for each column.
data_types = df.dtypes
print(data_types)

# Count the missing (NaN) values in each column.
missing_values = df.isnull().sum()
print(missing_values)
# Descriptive statistics.
# Step 1: derive per-employee pay totals and store them as new columns.
colomns_to_fill = ['BasePay', 'OvertimePay', 'OtherPay', 'Benefits']
# Coerce to numeric before filling: any non-numeric entry becomes NaN and is
# then replaced by 0, so the additions below are always arithmetic. For
# columns that are already numeric the coercion changes nothing.
df[colomns_to_fill] = (
    df[colomns_to_fill].apply(pd.to_numeric, errors='coerce').fillna(0)
)

# Total pay, and total pay including benefits.
df['TotalPay'] = df['BasePay'] + df['OvertimePay'] + df['OtherPay']
df['TotalPayBenefits'] = df['TotalPay'] + df['Benefits']

# 'salary' is the sum of 'TotalPay' and 'TotalPayBenefits'.
# NOTE(review): this counts base, overtime and other pay twice (once via each
# total). Kept as originally specified -- confirm this is the intended metric.
df['salary'] = df['TotalPay'] + df['TotalPayBenefits']
print(df[['TotalPay', 'TotalPayBenefits', 'salary']])

# Step 2: summarise the distribution of 'salary'.
# Mean.
mean_salary = df['salary'].mean()
print(f'mean salary:{mean_salary}')

# Median.
median_salary = df['salary'].median()
print(f'median salary:{median_salary}')

# Mode. mode() returns a Series (several values on ties, none for an empty
# column), so report the first value and fall back to NaN when there is none.
mode_salary = df['salary'].mode()
first_mode = mode_salary.values[0] if len(mode_salary) else float('nan')
print(f'mode salary:{first_mode}')

# Minimum.
min_salary = df['salary'].min()
print(f'minimum salary:{min_salary}')

# Maximum.
max_salary = df['salary'].max()
print(f'maximum salary:{max_salary}')

# Range, reusing the extremes computed above instead of rescanning the column.
salary_range = max_salary - min_salary
print(f'salary range:{salary_range}')

# Standard deviation.
std_dev_salary = df['salary'].std()
print(f'standard deviation of salary:{std_dev_salary}')
# Data cleaning.
# Fill the remaining missing values in numeric columns with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas 2.x raises TypeError as soon as the frame contains a column that
# cannot be averaged. Columns absent from the resulting Series (the
# non-numeric ones) are left untouched by fillna, and a column that is
# entirely NaN has no mean and therefore stays NaN.
# Mean imputation is simple and keeps each column's mean unchanged, at the
# cost of shrinking its variance; it is most defensible when the column is
# roughly symmetric.
df.fillna(df.mean(numeric_only=True), inplace=True)
# Stacked histogram showing how each pay component and both totals are
# distributed, all drawn on one shared set of axes.
plt.figure(figsize=(12, 8))
plt.hist(
    [
        df[column]
        for column in (
            'BasePay',
            'OvertimePay',
            'OtherPay',
            'Benefits',
            'TotalPay',
            'TotalPayBenefits',
        )
    ],
    bins=30,
    stacked=True,
    label=[
        'Base Pay',
        'Overtime Pay',
        'Other Pay',
        'Benefits',
        'Total Pay',
        'Total Pay with Benefits',
    ],
    color=['blue', 'orange', 'green', 'purple', 'pink', 'brown'],
    edgecolor='black',
)
plt.title('Distribution of Salaries by Components')
plt.xlabel('Salary')
plt.ylabel('Frequency')
plt.legend()
plt.show()
# Pie chart of the proportion of employees per job title ('JobTitle' is the
# column this script uses to stand in for departments).
# value_counts() sorts by frequency, so head(5) keeps the five most common
# titles; the original explode tuple had five entries, and pie() requires
# exactly one explode entry per slice. The percentages shown are therefore
# shares among the plotted titles only, not among all employees.
department_counts = df['JobTitle'].value_counts().head(5)
# One random RGB triple per slice.
colors = np.random.rand(len(department_counts), 3)
plt.figure(figsize=(5, 5))
plt.pie(
    department_counts,
    labels=department_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    # Pull out only the first (largest) slice; sized to the actual number of
    # slices so it also works when fewer than five titles exist.
    explode=[0.1 if position == 0 else 0
             for position in range(len(department_counts))],
)
plt.title('Proportion of Employees in Different Departments')
plt.show()
# Group the rows by 'JobTitle' (used as the department) and compute the mean,
# median and standard deviation of every pay column within each group.
summary_statistics = (
    df.groupby('JobTitle')
    .agg({
        pay_column: ['mean', 'median', 'std']
        for pay_column in (
            'BasePay',
            'OvertimePay',
            'OtherPay',
            'Benefits',
            'TotalPay',
            'TotalPayBenefits',
            'salary',
        )
    })
    .reset_index()
)
print(summary_statistics)
# Average 'salary' per 'JobTitle'; as_index=False keeps 'JobTitle' as an
# ordinary column so it can be used directly as the bar labels.
mean_salary_by_department = df.groupby('JobTitle', as_index=False)['salary'].mean()

# Bar chart comparing the average salary of each group.
plt.figure(figsize=(10, 6))
plt.bar(
    x=mean_salary_by_department['JobTitle'],
    height=mean_salary_by_department['salary'],
    color='skyblue',
)
plt.title('Average Salaries Across Different Departments')
plt.xlabel('Department')
plt.ylabel('Average Salary')
plt.show()
# Correlation between 'salary' and 'Year' (Series.corr defaults to Pearson).
correlation = df['salary'].corr(df['Year'])
# Report the coefficient; it was previously computed but never shown.
print(f'correlation between salary and Year:{correlation}')

# Scatter plot to visualise the relationship.
plt.figure(figsize=(10, 6))
plt.scatter(df['Year'], df['salary'], color='orange', alpha=0.7)
plt.title('Scatter Plot: Salary vs Years')
plt.xlabel('Years')
plt.ylabel('Salary')
plt.show()