### Preprocessing

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
url = "https://docs.google.com/spreadsheets/d/1VP9BE_eI2yl6uUHSm4mGiiwjRdoqCqnkcIjsv5Q2ex4/export?format=csv"
df = pd.read_csv(url)

# Check the initial data
print("Initial data:")
print(df.head())

# Correct the data in the "height" column
df['height'] = np.random.randint(150, 181, size=len(df))

# Check the corrected data
print("\nCorrected data:")
print(df.head())

# Retrieve the 'height' column after correction
height_column = df['height']
print(height_column)

Initial data:
            Name            Team  Number Position  Age  Height  Weight  \
0  Avery Bradley  Boston Celtics       0       PG   25  06-Feb     180   
1    Jae Crowder  Boston Celtics      99       SF   25  06-Jun     235   
2   John Holland  Boston Celtics      30       SG   27  06-May     205   
3    R.J. Hunter  Boston Celtics      28       SG   22  06-May     185   
4  Jonas Jerebko  Boston Celtics       8       PF   29  06-Oct     231   

             College     Salary  
0              Texas  7730337.0  
1          Marquette  6796117.0  
2  Boston University        NaN  
3      Georgia State  1148640.0  
4                NaN  5000000.0  

Corrected data:
            Name            Team  Number Position  Age  Height  Weight  \
0  Avery Bradley  Boston Celtics       0       PG   25  06-Feb     180   
1    Jae Crowder  Boston Celtics      99       SF   25  06-Jun     235   
2   John Holland  Boston Celtics      30       SG   27  06-May     205   
3    R.J. Hunter  Boston

##### Determine the distribution of employees across each team and calculate the percentage split relative to the total number of employees. 

In [None]:
# 1. Calculate the distribution of employees across each team
team_distribution = df['team'].value_counts()

# Calculate the percentage split relative to the total number of employees
total_employees = len(df)
percentage_split = (team_distribution / total_employees) * 100

# Display the results
print("Distribution of Employees Across Each Team:")
print(team_distribution)
print("\nPercentage Split Relative to Total Number of Employees:")
print(percentage_split)

##### Segregate employees based on their positions within the company.

In [None]:
employees_by_position = df.groupby('position')

# Display the number of employees in each position
for position, employees in employees_by_position:
    print(f"Position: {position}, Number of Employees: {len(employees)}")

##### Identify the predominant age group among employees. 

In [None]:
age_bins = [20, 30, 40, 50, 60, 70]  # You can adjust the age ranges as needed

# Create age groups
age_groups = pd.cut(df['age'], bins=age_bins)

# Count the number of employees in each age group
age_group_counts = age_groups.value_counts()

# Identify the predominant age group
predominant_age_group = age_group_counts.idxmax()

# Display the results
print("Number of Employees in Each Age Group:")
print(age_group_counts)
print("\nPredominant Age Group:", predominant_age_group)

# Group the data by team and calculate the total salary expenditure for each team
team_salary_expenditure = df.groupby('team')['salary'].sum()

##### Discover which team and position have the highest salary expenditure.

In [None]:
position_salary_expenditure = df.groupby('position')['salary'].sum()

# Discover which team and position have the highest salary expenditure
team_highest_salary = team_salary_expenditure.idxmax()
position_highest_salary = position_salary_expenditure.idxmax()

# Display the results
print("Team with the Highest Salary Expenditure:", team_highest_salary)
print("Position with the Highest Salary Expenditure:", position_highest_salary)

##### Investigate if there's any correlation between age and salary, and represent it visually.

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='age', y='salary', data=df)
plt.title('Correlation between Age and Salary')
plt.xlabel('Age')
plt.ylabel('Salary')
plt.grid(True)
plt.show()