In [1]:
import pandas as pd
# Load the CSV file into a DataFrame
df = pd.read_csv("/home/vicky/learning/machine-learning/sources/cleaned_data.csv")

In [2]:
# Calculate the length of each address
df['address_length'] = df['address'].apply(lambda x: len(str(x)))
print("Sample data after adding 'address_length':")
print(df[['address', 'address_length']].head(), "\n")

Sample data after adding 'address_length':
              address  address_length
0                 NaN               3
1  Street 45, City 22              18
2  Street 94, City 42              18
3  Street 58, City 50              18
4   Street 76, City 8              17 



In [3]:
# Define salary bins and labels
bins = [0, 50000, 70000, 100000]
labels = ['low', 'medium', 'high']


# Create a new column for salary categorization
df['salary_category'] = pd.cut(df['salary'], bins=bins, labels=labels, include_lowest=True)
print("Sample data after adding 'salary_category':")
print(df[['salary', 'salary_category']].head(), "\n")

Sample data after adding 'salary_category':
    salary salary_category
0  60000.0          medium
1  50000.0             low
2  60000.0          medium
3  70000.0          medium
4  60000.0          medium 



In [4]:
# Group data by department and compute mean salary and age
summary_report = df.groupby('department').agg({
    'salary': 'mean',
    'age': 'mean'
}).reset_index()


# Rename columns for clarity
summary_report.rename(columns={'salary': 'average_salary', 'age': 'average_age'}, inplace=True)
print("Summary report by department:")
print(summary_report)

Summary report by department:
  department  average_salary  average_age
0    Finance    59830.035515    48.345256
1         HR    60015.155342    48.620106
2         IT    60034.499754    48.650074
3  Marketing    60049.455984    48.419139
4    unknown    59939.954966    48.075056


In [5]:
# Save the cleaned data to CSV for further processing
transformed_data = "/home/vicky/learning/machine-learning/sources/transformed_data.csv"
df.to_csv(transformed_data, index=False)
print(f"\nCleaned data saved to {transformed_data}")


Cleaned data saved to /home/vicky/learning/machine-learning/sources/transformed_data.csv
