<a href="https://colab.research.google.com/github/yashashwinis24/Week2-task/blob/main/Internship_day9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from google.colab import files


np.random.seed(42)
data = {
    'Date': pd.date_range(start='2024-01-01', periods=365, freq='D'),  # Full year
    'Region': np.random.choice(['North', 'South', 'East', 'West'], 365),
    'Sales': np.random.randint(100, 2000, 365)  # Realistic sales range
}
df = pd.DataFrame(data)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\n" + "="*60 + "\n")

# Step 3: Total sales per region
total_sales_region = df.groupby('Region')['Sales'].sum().round(0)
print("TOTAL SALES PER REGION:")
print(total_sales_region)
print("\n" + "="*60 + "\n")

# Step 4: Average sales per month
monthly_avg_sales = df['Sales'].resample('ME').mean().round(2)
print("AVERAGE SALES PER MONTH:")
print(monthly_avg_sales)
print("\nOverall average monthly sales:", monthly_avg_sales.mean().round(2))
print("\n" + "="*60 + "\n")

# Step 5: Best performing region
best_region = total_sales_region.idxmax()
best_sales = total_sales_region.max()
print(f"BEST PERFORMING REGION: {best_region}")
print(f"Total sales: ${best_sales:,.0f}")
print(f"Share of total sales: {best_sales/total_sales_region.sum()*100:.1f}%")

# Summary table
summary = pd.DataFrame({
    'Total Sales': total_sales_region,
    'Monthly Avg': df.groupby('Region')['Sales'].resample('ME').mean().mean().round(2)
}).round(2)
print("\nSUMMARY TABLE:")
print(summary)

print("\nAnalysis complete! No errors.")

Data loaded successfully!
Shape: (365, 2)

First 5 rows:
           Region  Sales
Date                    
2024-01-01   East   1538
2024-01-02   West   1421
2024-01-03  North   1734
2024-01-04   East    362
2024-01-05   East   1887


TOTAL SALES PER REGION:
Region
East      97726
North     92099
South     89394
West     108315
Name: Sales, dtype: int64


AVERAGE SALES PER MONTH:
Date
2024-01-31     967.71
2024-02-29    1114.69
2024-03-31     999.39
2024-04-30     983.90
2024-05-31    1088.45
2024-06-30    1069.53
2024-07-31    1050.16
2024-08-31    1089.42
2024-09-30    1144.33
2024-10-31    1073.90
2024-11-30    1125.23
2024-12-31    1039.27
Freq: ME, Name: Sales, dtype: float64

Overall average monthly sales: 1062.16


BEST PERFORMING REGION: West
Total sales: $108,315
Share of total sales: 27.9%

SUMMARY TABLE:
        Total Sales  Monthly Avg
Region                          
East          97726      1050.16
North         92099      1050.16
South         89394      1050.16
West     

In [2]:
import pandas as pd
import numpy as np
from google.colab import files

print("Student Ranking System - Google Colab")
print("=" * 50)


data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace'],
    'Math': [85, 92, 78, 95, 88, 76, 91],
    'Science': [78, 85, 82, 88, 92, 79, 87],
    'English': [92, 88, 85, 90, 87, 84, 93],
    'History': [87, 91, 89, 86, 90, 88, 85]
}

df = pd.DataFrame(data)
print("\nRaw Student Data:")
print(df)
print("\n" + "="*50)

# Step 2: Calculate totals and averages
df['Total'] = df[['Math', 'Science', 'English', 'History']].sum(axis=1)
df['Average'] = df['Total'] / 4
df['Average'] = df['Average'].round(2)

print("\nData with Totals & Averages:")
print(df[['Student', 'Total', 'Average']])
print("\n" + "="*50)

# Step 3: RANKING - Method 1: Simple descending sort (Rank 1 = Top)
df_sorted = df.sort_values('Total', ascending=False).reset_index(drop=True)
df_sorted['Rank'] = df_sorted.index + 1
print("\nRANKING BY TOTAL MARKS (Simple):")
print(df_sorted[['Student', 'Total', 'Average', 'Rank']])
print("\n" + "="*50)

# Step 4: RANKING - Method 2: Proper ranking with ties (using rank())
df['Rank_Total'] = df['Total'].rank(ascending=False, method='min').astype(int)
df['Rank_Avg'] = df['Average'].rank(ascending=False, method='min').astype(int)

print("\nRANKING WITH TIES HANDLED:")
print(df[['Student', 'Total', 'Average', 'Rank_Total', 'Rank_Avg']].sort_values('Rank_Total'))
print("\n" + "="*50)

# Step 5: Top performers summary
print("\nTOP 3 STUDENTS:")
top3 = df_sorted.head(3)[['Student', 'Total', 'Average', 'Rank']]
print(top3)

print("\nCLASS STATISTICS:")
print(f"Class Average: {df['Average'].mean():.2f}")
print(f"Top Score: {df['Total'].max()}")
print(f"Minimum Passing (60%): {df[df['Average'] >= 60].shape[0]} students")

# Step 6: Grade assignment
def get_grade(avg):
    if avg >= 90: return 'A+'
    elif avg >= 80: return 'A'
    elif avg >= 70: return 'B'
    elif avg >= 60: return 'C'
    else: return 'F'

df['Grade'] = df['Average'].apply(get_grade)
print("\nFINAL RANKING WITH GRADES:")
final_ranking = df[['Student', 'Total', 'Average', 'Rank_Total', 'Grade']].sort_values('Rank_Total')
print(final_ranking)

print("\n Student Ranking Complete!")



Student Ranking System - Google Colab

Raw Student Data:
   Student  Math  Science  English  History
0    Alice    85       78       92       87
1      Bob    92       85       88       91
2  Charlie    78       82       85       89
3    Diana    95       88       90       86
4      Eve    88       92       87       90
5    Frank    76       79       84       88
6    Grace    91       87       93       85


Data with Totals & Averages:
   Student  Total  Average
0    Alice    342    85.50
1      Bob    356    89.00
2  Charlie    334    83.50
3    Diana    359    89.75
4      Eve    357    89.25
5    Frank    327    81.75
6    Grace    356    89.00


RANKING BY TOTAL MARKS (Simple):
   Student  Total  Average  Rank
0    Diana    359    89.75     1
1      Eve    357    89.25     2
2      Bob    356    89.00     3
3    Grace    356    89.00     4
4    Alice    342    85.50     5
5  Charlie    334    83.50     6
6    Frank    327    81.75     7


RANKING WITH TIES HANDLED:
   Student  Tota

In [3]:
import pandas as pd
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt

print(" PRODUCTION QUALITY CONTROL SYSTEM")
print("=" * 60)

# Step 1: Generate sample production data (or upload CSV)
np.random.seed(42)
data = {
    'Batch_ID': ['B' + str(i).zfill(4) for i in range(1, 501)],  # 500 batches
    'Product_Type': np.random.choice(['Widget_A', 'Widget_B', 'Widget_C'], 500),
    'Shift': np.random.choice(['Morning', 'Afternoon', 'Night'], 500),
    'Units_Produced': np.random.randint(50, 200, 500),
    'Defects_Found': np.random.randint(0, 15, 500),  # 0-14 defects
    'Date': pd.date_range('2025-01-01', periods=500, freq='12H')  # Every 12 hours
}

df = pd.DataFrame(data)
df['Pass_Rate'] = ((df['Units_Produced'] - df['Defects_Found']) / df['Units_Produced'] * 100).round(2)
df['Quality_Status'] = np.where(df['Pass_Rate'] >= 95, 'PASS', 'FAIL')
df.set_index('Date', inplace=True)

print(f" Production data loaded: {df.shape[0]} batches")
print("\nSample data:")
print(df.head())
print("\n" + "="*60)

# Step 2: KEY QUALITY METRICS
total_produced = df['Units_Produced'].sum()
total_defects = df['Units_Produced'].sum()
overall_pass_rate = df['Pass_Rate'].mean().round(2)

print(" PRODUCTION SUMMARY")
print(f"Total Units Produced: {total_produced:,}")
print(f"Total Defects Found: {total_defects:,}")
print(f"Overall Pass Rate: {overall_pass_rate}%")
print(f"Failed Batches: {len(df[df['Quality_Status']=='FAIL'])} / {len(df)} ({100-len(df[df['Quality_Status']=='PASS'])/len(df)*100:.1f}%)")
print("\n" + "="*60)

# Step 3: QUALITY BY PRODUCT TYPE
quality_by_product = df.groupby('Product_Type').agg({
    'Units_Produced': 'sum',
    'Defects_Found': 'sum',
    'Pass_Rate': 'mean'
}).round(2)
quality_by_product['Defect_Rate_%'] = (quality_by_product['Defects_Found'] / quality_by_product['Units_Produced'] * 100).round(2)

print(" QUALITY BY PRODUCT TYPE")
print(quality_by_product)
print("\n" + "="*60)

# Step 4: QUALITY BY SHIFT
quality_by_shift = df.groupby('Shift').agg({
    'Units_Produced': 'sum',
    'Defects_Found': 'sum',
    'Pass_Rate': 'mean'
}).round(2)
quality_by_shift['Defect_Rate_%'] = (quality_by_shift['Defects_Found'] / quality_by_shift['Units_Produced'] * 100).round(2)

print(" QUALITY BY SHIFT")
print(quality_by_shift)
print("\n" + "="*60)

# Step 5: WORST BATCHES (Top 10 failures)
worst_batches = df.nsmallest(10, 'Pass_Rate')[['Batch_ID', 'Product_Type', 'Shift', 'Units_Produced', 'Defects_Found', 'Pass_Rate', 'Quality_Status']]
print(" TOP 10 WORST BATCHES")
print(worst_batches)
print("\n" + "="*60)

# Step 6: TREND ANALYSIS (Daily pass rates)
daily_quality = df['Pass_Rate'].resample('D').mean().round(2)
print("ðŸ“ˆ DAILY QUALITY TREND (First 10 days):")
print(daily_quality.head(10))

# Step 7: SUMMARY DASHBOARD
print("\n QUALITY DASHBOARD")
print("="*40)
print(f"Overall Pass Rate: {overall_pass_rate}%")
print(f"Critical Issues: {len(df[df['Pass_Rate'] < 90])} batches < 90%")
print(f"Warning: {len(df[(df['Pass_Rate'] >= 90) & (df['Pass_Rate'] < 95)])} batches 90-95%")
print(f"Excellent: {len(df[df['Pass_Rate'] >= 95])} batches >= 95%")

best_product = quality_by_product['Pass_Rate'].idxmax()
print(f"\n BEST PRODUCT: {best_product} ({quality_by_product['Pass_Rate'].max():.1f}% avg)")

print("\n PRODUCTION QUALITY ANALYSIS COMPLETE!")
print("\n Action Items:")
print("   - Review worst batches above")
print("   - Monitor Night shift performance")
print("   - Scale up best-performing product")


 PRODUCTION QUALITY CONTROL SYSTEM
 Production data loaded: 500 batches

Sample data:
                    Batch_ID Product_Type      Shift  Units_Produced  \
Date                                                                   
2025-01-01 00:00:00    B0001     Widget_C  Afternoon             180   
2025-01-01 12:00:00    B0002     Widget_A  Afternoon              80   
2025-01-02 00:00:00    B0003     Widget_C      Night              89   
2025-01-02 12:00:00    B0004     Widget_C  Afternoon              85   
2025-01-03 00:00:00    B0005     Widget_A    Morning              55   

                     Defects_Found  Pass_Rate Quality_Status  
Date                                                          
2025-01-01 00:00:00             10      94.44           FAIL  
2025-01-01 12:00:00              1      98.75           PASS  
2025-01-02 00:00:00              8      91.01           FAIL  
2025-01-02 12:00:00              9      89.41           FAIL  
2025-01-03 00:00:00            

  'Date': pd.date_range('2025-01-01', periods=500, freq='12H')  # Every 12 hours


In [4]:
import pandas as pd
import numpy as np
from google.colab import files

print(" BROADCASTING PRICING SYSTEM - FIXED")
print("=" * 60)

# Step 1: Sample product catalog (10,000 products)
np.random.seed(42)
n_products = 10000

data = {
    'Product_ID': ['P' + str(i).zfill(6) for i in range(1, n_products + 1)],
    'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_products),
    'Base_Price': np.random.uniform(10, 500, n_products).round(2),
    'Quantity': np.random.randint(1, 1000, n_products),
    'Cost_Price': np.random.uniform(5, 400, n_products).round(2)
}
df = pd.DataFrame(data)

print(f" Loaded {len(df):,} products")
print("\nSample products:")
print(df.head())
print("\n" + "="*60)

# Step 2: BROADCASTING OPERATIONS

# GLOBAL PRICE CHANGE (5% across ALL products) - SCALAR BROADCASTING
df['Price_After_Global'] = df['Base_Price'] * 1.05
print(" GLOBAL 5% PRICE INCREASE:")
print("Before:", df['Base_Price'].head().tolist())
print("After: ", df['Price_After_Global'].head().tolist())
print()

# CATEGORY-SPECIFIC DISCOUNTS - SERIES BROADCASTING
discounts = pd.Series({
    'Electronics': 0.10,  # 10% off
    'Clothing': 0.15,     # 15% off
    'Books': 0.05,        # 5% off
    'Home': 0.00          # No discount
}, name='Discount')

df['Discounted_Price'] = df['Base_Price'] * (1 - df['Category'].map(discounts))
print(" CATEGORY DISCOUNTS:")
print(discounts)
print("\nElectronics example:")
electronics = df[df['Category'] == 'Electronics'].head(1)
print(f"Before: ${electronics['Base_Price'].iloc[0]:.2f}")
print(f"After:  ${electronics['Discounted_Price'].iloc[0]:.2f}")
print()

# FIXED TAXES SECTION - CORRECT SYNTAX
print(" TAXES APPLIED (FIXED):")
tax_rate = 0.08  # 8% tax on final price
df['Final_Price'] = df['Discounted_Price'] * (1 + tax_rate)  # FIXED LINE
print(f"Tax rate applied: {tax_rate*100}%")
print("\nSample final prices:")
print(df[['Product_ID', 'Base_Price', 'Discounted_Price', 'Final_Price']].head())
print()

# REVENUE CALCULATIONS - VECTOR BROADCASTING
df['Revenue'] = df['Final_Price'] * df['Quantity']
df['Profit'] = df['Revenue'] - (df['Cost_Price'] * df['Quantity'])

print(" REVENUE & PROFIT:")
print(f"Total Revenue: ${df['Revenue'].sum():,.0f}")
print(f"Total Profit:  ${df['Profit'].sum():,.0f}")
print(f"Avg Profit Margin: {df['Profit'].sum()/df['Revenue'].sum()*100:.1f}%")
print()

# PROMOTIONAL BULK DISCOUNTS - CONDITIONAL BROADCASTING
df['Bulk_Discount'] = np.where(df['Quantity'] > 500, 0.20, 0)  # 20% for bulk
df['Bulk_Price'] = df['Final_Price'] * (1 - df['Bulk_Discount'])

print(" BULK DISCOUNTS (>500 units):")
bulk_sales = df[df['Quantity'] > 500]
print(f"Bulk orders found: {len(bulk_sales)}")
print(bulk_sales[['Product_ID', 'Quantity', 'Final_Price', 'Bulk_Price']].head())
print()

# PRICING SUMMARY BY CATEGORY
pricing_summary = df.groupby('Category').agg({
    'Base_Price': 'mean',
    'Final_Price': 'mean',
    'Revenue': 'sum',
    'Profit': 'sum'
}).round(2)

pricing_summary.columns = ['Avg_Base_Price', 'Avg_Final_Price', 'Total_Revenue', 'Total_Profit']
print(" PRICING SUMMARY BY CATEGORY:")
print(pricing_summary)
print()

# TOP PRODUCTS BY REVENUE
top_products = df.nlargest(5, 'Revenue')[['Product_ID', 'Category', 'Final_Price', 'Quantity', 'Revenue']]
print(" TOP 5 REVENUE GENERATORS:")
print(top_products.round(2))

print("\n BROADCASTING PRICING SYSTEM COMPLETE! ")
print("\n KEY BROADCASTING DEMONSTRATED:")
print("   â€¢ Scalar Ã— DataFrame (Global 5% increase)")
print("   â€¢ Series Ã— DataFrame (Category discounts)")
print("   â€¢ Vector Ã— Vector (Price Ã— Quantity)")
print("   â€¢ Conditional np.where() (Bulk discounts)")


 BROADCASTING PRICING SYSTEM - FIXED
 Loaded 10,000 products

Sample products:
  Product_ID     Category  Base_Price  Quantity  Cost_Price
0    P000001        Books      202.88       924      322.85
1    P000002         Home      241.98       278       73.67
2    P000003  Electronics      428.73        16      199.69
3    P000004        Books      176.60       525      276.77
4    P000005        Books      436.13       356      262.54

 GLOBAL 5% PRICE INCREASE:
Before: [202.88, 241.98, 428.73, 176.6, 436.13]
After:  [213.024, 254.079, 450.16650000000004, 185.43, 457.9365]

 CATEGORY DISCOUNTS:
Electronics    0.10
Clothing       0.15
Books          0.05
Home           0.00
Name: Discount, dtype: float64

Electronics example:
Before: $428.73
After:  $385.86

 TAXES APPLIED (FIXED):
Tax rate applied: 8.0%

Sample final prices:
  Product_ID  Base_Price  Discounted_Price  Final_Price
0    P000001      202.88          192.7360    208.15488
1    P000002      241.98          241.9800    261.3

In [5]:
import numpy as np
import pandas as pd  # For nice display

print(" NUMPY CUSTOMER DATA ANALYSIS - FIXED")
print("=" * 50)

# Step 1: CREATE CUSTOMER DATA WITH NUMPY (1000 customers) - NUMERIC ARRAYS ONLY
np.random.seed(42)

n_customers = 1000

# Pure numeric arrays + separate region array
customer_ids = np.random.randint(1000, 9999, n_customers).astype(int)
ages = np.random.randint(18, 80, n_customers).astype(int)
purchases = np.random.uniform(50, 2000, n_customers)  # Keep as float
regions = np.random.choice(['North', 'South', 'East', 'West'], n_customers)

print(f" Created {n_customers:,} customers")
print("\nNumeric arrays created:")
print(f"IDs shape: {customer_ids.shape}, dtype: {customer_ids.dtype}")
print(f"Purchases shape: {purchases.shape}, dtype: {purchases.dtype}")
print("\nSample data (first 5):")
for i in range(5):
    print(f"  CUST{customer_ids[i]:04d}: Age {ages[i]}, ${purchases[i]:8.2f}, {regions[i]}")
print("\n" + "="*50)

# TASK 1: Total customers
total_customers = len(purchases)
print(f" TOTAL CUSTOMERS: {total_customers:,}")
print()

# TASK 2: Extract only purchase column with SLICING - FIXED
purchase_column = purchases[:]  # Simple slicing - already numeric!
print(" PURCHASE COLUMN (SLICING):")
print(f"Shape: {purchase_column.shape}")
print(f"First 10 values: {purchase_column[:10].round(2)}")  # âœ… Works now!
print()

# TASK 3: Average purchase amount
avg_purchase = np.mean(purchases)
print(f" AVERAGE PURCHASE: ${avg_purchase:.2f}")
print()

# TASK 4: Maximum and minimum purchase
max_purchase = np.max(purchases)
min_purchase = np.min(purchases)
print(f" MAX/MIN PURCHASE:")
print(f"   Maximum: ${max_purchase:.2f}")
print(f"   Minimum: ${min_purchase:.2f}")
print()

# TASK 5: Customers above average purchase - BOOLEAN INDEXING
above_avg_mask = purchases > avg_purchase
high_spenders_count = np.sum(above_avg_mask)

print(f"CUSTOMERS ABOVE AVERAGE (${avg_purchase:.2f}):")
print(f"   Count: {high_spenders_count} / {total_customers} ({high_spenders_count/total_customers*100:.1f}%)")

# Show top 5 high spenders
top_indices = np.argsort(purchases)[::-1][:5]  # Get indices of top 5
print("\nTop 5 high spenders:")
for i, idx in enumerate(top_indices):
    print(f"   CUST{customer_ids[idx]:04d}: Age {ages[idx]}, ${purchases[idx]:8.2f}, {regions[idx]}")

print("\n" + "="*50)

# BONUS: Complete summary
print(" SUMMARY STATISTICS:")
print(f"Total Customers:      {total_customers:,}")
print(f"Avg Purchase:         ${avg_purchase:.2f}")
print(f"Median Purchase:      ${np.median(purchases):.2f}")
print(f"Std Dev:              ${np.std(purchases):.2f}")
print(f"High Spenders:        {high_spenders_count} ({high_spenders_count/total_customers*100:.1f}%)")

# Region breakdown
unique_regions, counts = np.unique(regions, return_counts=True)
print("\nCustomers by Region:")
for region, count in zip(unique_regions, counts):
    print(f"   {region:8s}: {count:4d} ({count/total_customers*100:5.1f}%)")

# Age demographics
print(f"\nAge Range: {ages.min()}-{ages.max()} years")
print(f"Avg Age:   {ages.mean():.1f} years")

print("\n NUMPY CUSTOMER ANALYSIS COMPLETE! ")
print("\n Key NumPy Techniques Used:")
print("   â€¢ np.column_stack() - Combine arrays")
print("   â€¢ purchases[:] - Slicing")
print("   â€¢ Boolean indexing: purchases > avg")
print("   â€¢ np.argsort() - Top customers")


 NUMPY CUSTOMER DATA ANALYSIS - FIXED
 Created 1,000 customers

Numeric arrays created:
IDs shape: (1000,), dtype: int64
Purchases shape: (1000,), dtype: float64

Sample data (first 5):
  CUST8270: Age 46, $ 1321.64, East
  CUST1860: Age 72, $  658.69, West
  CUST6390: Age 56, $  828.14, West
  CUST6191: Age 20, $ 1723.07, East
  CUST6734: Age 49, $   67.61, North

 TOTAL CUSTOMERS: 1,000

 PURCHASE COLUMN (SLICING):
Shape: (1000,)
First 10 values: [1321.64  658.69  828.14 1723.07   67.61 1197.4  1217.49  823.49  512.2
  507.61]

 AVERAGE PURCHASE: $1011.97

 MAX/MIN PURCHASE:
   Maximum: $1994.30
   Minimum: $50.37

CUSTOMERS ABOVE AVERAGE ($1011.97):
   Count: 502 / 1000 (50.2%)

Top 5 high spenders:
   CUST6750: Age 63, $ 1994.30, West
   CUST5973: Age 61, $ 1991.19, West
   CUST1391: Age 54, $ 1987.01, South
   CUST5171: Age 35, $ 1985.21, East
   CUST6560: Age 41, $ 1983.90, South

 SUMMARY STATISTICS:
Total Customers:      1,000
Avg Purchase:         $1011.97
Median Purchase:    

In [6]:
import numpy as np
import pandas as pd

print("EMPLOYEE DATASET ANALYSIS")
print("=" * 50)

# Step 1: CREATE EMPLOYEE DATASET (500 employees)
np.random.seed(42)
n_employees = 500

# Generate realistic employee data
employee_ids = np.random.randint(1000, 9999, n_employees).astype(int)
names = [f"Emp_{id:04d}" for id in employee_ids]
ages = np.random.normal(35, 8, n_employees).astype(int)  # Mean 35, std 8
ages = np.clip(ages, 22, 65)  # Realistic age range
salaries = np.random.normal(75000, 25000, n_employees).round(0)  # Mean 75k, std 25k
salaries = np.clip(salaries, 30000, 200000)  # Realistic salary range

# Create DataFrame for easy analysis
df = pd.DataFrame({
    'Employee_ID': employee_ids,
    'Employee_Name': names,
    'Age': ages,
    'Salary': salaries
})

print(f" Created {n_employees:,} employees")
print("\nSample data (first 5):")
print(df.head())
print("\n" + "="*50)

# TASK 1: Total salary of all employees
total_salary = df['Salary'].sum()
print(f" TOTAL SALARY: ${total_salary:,.0f}")
print()

# TASK 2: Minimum salary
min_salary = df['Salary'].min()
print(f" MINIMUM SALARY: ${min_salary:,.0f}")
print()

# TASK 3: Maximum salary
max_salary = df['Salary'].max()
print(f" MAXIMUM SALARY: ${max_salary:,.0f}")
print()

# TASK 4: Average age of employees
avg_age = df['Age'].mean()
print(f" AVERAGE AGE: {avg_age:.1f} years")
print()

# TASK 5: Total number of employees
total_employees = len(df)
print(f" TOTAL EMPLOYEES: {total_employees:,}")
print()

# TASK 6: Correlation between age and salary
correlation = df['Age'].corr(df['Salary'])
print(f" AGE-SALARY CORRELATION: {correlation:.3f}")
print("   (1.0 = perfect positive, 0 = no correlation, -1.0 = perfect negative)")
print()

# BONUS: Detailed Summary
print(" EMPLOYEE SUMMARY DASHBOARD")
print("="*40)
print(f" Salary Range:    ${min_salary:,.0f} - ${max_salary:,.0f}")
print(f" Total Payroll:   ${total_salary:,.0f}")
print(f" Total Employees: {total_employees:,}")
print(f" Average Age:     {avg_age:.1f} years")
print(f" Age-Salary Corr: {correlation:.3f}")
print(f" Median Salary:   ${df['Salary'].median():,.0f}")

# Age groups
age_groups = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60], labels=['20s', '30s', '40s', '50+'])
print("\nAge Distribution:")
print(age_groups.value_counts().sort_index())

# Top 5 highest paid
print("\n TOP 5 HIGHEST PAID:")
top_5 = df.nlargest(5, 'Salary')[['Employee_Name', 'Age', 'Salary']]
print(top_5)

print("\n EMPLOYEE ANALYSIS COMPLETE!")


EMPLOYEE DATASET ANALYSIS
 Created 500 employees

Sample data (first 5):
   Employee_ID Employee_Name  Age    Salary
0         8270      Emp_8270   42   47647.0
1         1860      Emp_1860   38   30000.0
2         6390      Emp_6390   35   64058.0
3         6191      Emp_6191   35   45342.0
4         6734      Emp_6734   23  139943.0

 TOTAL SALARY: $38,647,008

 MINIMUM SALARY: $30,000

 MAXIMUM SALARY: $154,184

 AVERAGE AGE: 35.2 years

 TOTAL EMPLOYEES: 500

 AGE-SALARY CORRELATION: 0.045
   (1.0 = perfect positive, 0 = no correlation, -1.0 = perfect negative)

 EMPLOYEE SUMMARY DASHBOARD
 Salary Range:    $30,000 - $154,184
 Total Payroll:   $38,647,008
 Total Employees: 500
 Average Age:     35.2 years
 Age-Salary Corr: 0.045
 Median Salary:   $76,123

Age Distribution:
Age
20s    152
30s    213
40s    118
50+     17
Name: count, dtype: int64

 TOP 5 HIGHEST PAID:
    Employee_Name  Age    Salary
41       Emp_3612   34  154184.0
210      Emp_9120   36  152622.0
4        Emp_6734