In [81]:
from faker import Faker
import pandas as pd
import numpy as np
import random

from datetime import date

# Seed both random sources used by the data generators (the stdlib `random`
# module and Faker's shared generator) so that Restart Kernel -> Run All
# reproduces the same tables on every run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

def generate_employee_data(n_rows):
    """Build a DataFrame holding ``n_rows`` synthetic employee records.

    Names, cities, joining dates, e-mails, phone numbers and job titles are
    produced by the module-level ``fake`` generator; ages, salaries and
    departments are drawn with the ``random`` module. ``Employee_ID`` runs
    from 1 to ``n_rows``.
    """
    departments = ['Sales', 'IT', 'Marketing', 'HR', 'Finance']

    def column(make_value):
        # One freshly generated value per row.
        return [make_value() for _ in range(n_rows)]

    # Columns are filled one after another, top to bottom, so each generator
    # is consumed column by column.
    records = {}
    records["Employee_ID"] = range(1, n_rows + 1)
    records["Name"] = column(fake.name)
    records["Age"] = column(lambda: random.randint(22, 65))
    records["Salary"] = column(lambda: round(random.uniform(35000, 150000), 2))
    records["City"] = column(fake.city)
    records["Joining_Date"] = column(
        lambda: fake.date_between(start_date='-10y', end_date='today')
    )
    records["Email"] = column(fake.email)
    records["Phone"] = column(fake.phone_number)
    records["Department"] = column(lambda: random.choice(departments))
    records["Job_Title"] = column(fake.job)
    return pd.DataFrame(records)

# Build the 100-row employee table, then preview its first ten rows and shape
df = generate_employee_data(n_rows=100)
print(df.head(n=10))
print(f"\nDataFrame shape: {df.shape}")

# Persist the table as CSV, leaving the row index out of the file
df.to_csv(path_or_buf='employees_data.csv', index=False)

   Employee_ID                 Name  Age     Salary               City  \
0            1          Bryan Silva   60   51593.58  North Janicehaven   
1            2        Andrew Knight   42   83172.85        West Daniel   
2            3   Pamela Blankenship   52   94469.61         West Wendy   
3            4  Catherine Hernandez   32   98175.72       Jameschester   
4            5      Joann Henderson   64  111366.58      West Jennifer   
5            6         Donna Morris   53  113107.58        Lake Debbie   
6            7       Allison Garcia   35  105687.75           New Gail   
7            8     Sandra Rodriguez   61   73007.78        Michaelstad   
8            9        Mindy Salinas   22   87900.02    East Joshuaside   
9           10        Loretta Brown   44  118983.08        North James   

  Joining_Date                          Email                 Phone  \
0   2016-06-12          uwilliams@example.net     467.229.9540x9930   
1   2021-05-03             dale57@example.n

## Task 1


In [82]:
# Keep employees in IT or Finance who earn more than 60000 and are aged
# 25 to 50 (both ends included).
filtered_df = df.loc[
    (df['Department'].isin(['IT', 'Finance']))
    & (df['Salary'] > 60000)
    & (df['Age'] >= 25)
    & (df['Age'] <= 50)
]
print("\nFiltered DataFrame:")
print(filtered_df)



Filtered DataFrame:
    Employee_ID                 Name  Age     Salary                City  \
1             2        Andrew Knight   42   83172.85         West Daniel   
3             4  Catherine Hernandez   32   98175.72        Jameschester   
9            10        Loretta Brown   44  118983.08         North James   
11           12           Paul Smith   25  119709.94        Griffithland   
17           18         Scott Miller   34  142374.85           Scottside   
25           26      Danielle George   43  101218.46         Michaelfurt   
39           40         Brandy Young   35   94702.92       West Brittany   
44           45        Denise Wilson   50  115802.36       West Nicholas   
54           55         Dylan Miller   29  103578.08         Herrerafurt   
56           57          Robert Long   45  133393.14         Port Angela   
67           68           Emily Dunn   39  146416.29   North Stephenfort   
88           89       Laura Stephens   48   63162.26         Annett

## Task 2

In [83]:
# Label-based selection: a .loc slice includes both endpoints, so this takes
# the rows labelled 2, 3 and 4.
selected_df_loc = df.loc[
    2:4,
    ['Name', 'Age', 'Salary'],
]
print("\nSelected DataFrame using loc:")
print(selected_df_loc)

# Position-based selection: first three rows, first two columns
# (the end of an .iloc slice is excluded).
selected_df_iloc = df.iloc[:3, :2]
print("\nSelected DataFrame using iloc:")
print(selected_df_iloc)


Selected DataFrame using loc:
                  Name  Age     Salary
2   Pamela Blankenship   52   94469.61
3  Catherine Hernandez   32   98175.72
4      Joann Henderson   64  111366.58

Selected DataFrame using iloc:
   Employee_ID                Name
0            1         Bryan Silva
1            2       Andrew Knight
2            3  Pamela Blankenship


## Task 3

In [84]:
# --- Inject missing values --------------------------------------------------
# `.loc` is label-based, so the rows to blank out are sampled from the frame's
# actual index labels rather than from positions 0..len(df)-1. With the
# default 0..n-1 index both give the same rows, but positional numbers can
# raise KeyError once the index has gaps (e.g. when this cell is re-run after
# the row drop further down).
row_labels = list(df.index)

# Age is generated as integers; cast to float up front so that writing NaN
# does not depend on an implicit int -> float upcast (newer pandas versions
# warn about that upcast -- NOTE(review): confirm against the pinned version).
df["Age"] = df["Age"].astype("float64")

df.loc[random.sample(row_labels, 5), "Salary"] = np.nan
df.loc[random.sample(row_labels, 3), "Department"] = np.nan
df.loc[random.sample(row_labels, 4), "Age"] = np.nan

# Per-column count of missing entries
missing_counts = df.isnull().sum()
print(missing_counts)

print("\nDataFrame with missing values:")
print(df)

# --- Handle missing values --------------------------------------------------
# Salary: impute with the column mean (Series.mean skips NaN by default).
mean_salary = df['Salary'].mean()
df['Salary'] = df['Salary'].fillna(mean_salary)

# Department: use an explicit placeholder category.
df['Department'] = df['Department'].fillna('Unknown')

# Age: drop the affected rows. Rebinding `df` instead of using inplace=True
# leaves `df` with the same contents while avoiding in-place mutation.
df = df.dropna(subset=['Age'])

# Final safety net: drop any row that still has a missing value in any column.
df_with_no_missing_values = df.dropna()
print("\nDataFrame after handling missing values:")
print(df_with_no_missing_values)

Employee_ID     0
Name            0
Age             4
Salary          5
City            0
Joining_Date    0
Email           0
Phone           0
Department      3
Job_Title       0
dtype: int64

DataFrame with missing values:
    Employee_ID                 Name   Age     Salary               City  \
0             1          Bryan Silva  60.0   51593.58  North Janicehaven   
1             2        Andrew Knight  42.0   83172.85        West Daniel   
2             3   Pamela Blankenship  52.0   94469.61         West Wendy   
3             4  Catherine Hernandez  32.0   98175.72       Jameschester   
4             5      Joann Henderson  64.0  111366.58      West Jennifer   
..          ...                  ...   ...        ...                ...   
95           96         Allison Chen  35.0  145970.42         New Miguel   
96           97        Robert Bishop  38.0  136866.71        Lake Steven   
97           98         Daniel Smith  30.0   75090.99          Julieland   
98           99

## Task 4

In [85]:
def generate_student_data(n_rows):
    """Build a DataFrame holding ``n_rows`` synthetic student records.

    Names and dates of birth (ages 18 to 30) are produced by the module-level
    ``fake`` generator; GPA (2.0 to 4.0, two decimals), credits (0 to 120) and
    major are drawn with the ``random`` module. ``Student_ID`` runs from 1 to
    ``n_rows``.
    """
    majors = ["Computer Science", "Math", "Physics", "Biology", "Economics"]

    def column(make_value):
        # One freshly generated value per row.
        return [make_value() for _ in range(n_rows)]

    # Columns are filled one after another, top to bottom, so each generator
    # is consumed column by column.
    records = {}
    records["Student_ID"] = range(1, n_rows + 1)
    records["Name"] = column(fake.name)
    records["Date_of_Birth"] = column(
        lambda: fake.date_of_birth(minimum_age=18, maximum_age=30)
    )
    records["GPA"] = column(lambda: round(random.uniform(2.0, 4.0), 2))
    records["Credits"] = column(lambda: random.randint(0, 120))
    records["Major"] = column(lambda: random.choice(majors))
    return pd.DataFrame(records)


# Build the 15-row student table and preview its first five rows
df_students = generate_student_data(n_rows=15)
print(df_students.head(n=5))

   Student_ID               Name Date_of_Birth   GPA  Credits      Major
0           1      Carl Williams    1997-12-18  2.51       50    Biology
1           2  Elizabeth Johnson    2005-01-06  3.97      114       Math
2           3       Morgan Wells    1999-12-16  3.62      112  Economics
3           4   Michael Anderson    2006-11-20  3.39       76    Physics
4           5       Jerry Thomas    1997-11-18  3.35       31    Biology


In [86]:
# --- GPA bands ----------------------------------------------------------------
# Bin edges for pd.cut; intervals are closed on the right, and
# include_lowest=True makes the first interval include its left edge (0).
gpa_category = [0, 2.5, 3.0, 3.5, 4.0]
# NOTE(review): 'Below Avarage' is misspelled ("Average"). The label is left
# unchanged here because other cells may match on the exact string.
gpa_labels = ['Below Avarage', 'Average', 'Good', 'Excellent']

df_students['GPA_Category'] = pd.cut(
    df_students['GPA'],
    bins=gpa_category,
    labels=gpa_labels,
    include_lowest=True,
)

# --- Age in completed years ---------------------------------------------------
# Calendar-based: year difference, minus one if this year's birthday has not
# happened yet. Dividing the day count by 365 ignores leap days and reports a
# student as one year older during the days just before their birthday.
today = pd.Timestamp(date.today())
dob = pd.to_datetime(df_students['Date_of_Birth'])
birthday_pending = (dob.dt.month > today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day > today.day)
)
df_students['Age'] = today.year - dob.dt.year - birthday_pending.astype('int64')

# --- Min-max normalised GPA (0 = lowest GPA in the table, 1 = highest) --------
# If every GPA is identical the range is 0 and the result is NaN.
gpa = df_students["GPA"]
df_students["GPA_Normalized"] = (gpa - gpa.min()) / (gpa.max() - gpa.min())

print(df_students)

# --- Dean's list: GPA of at least 3.5 with at least 60 credits ----------------
deans_list = df_students[(df_students['GPA'] >= 3.5) & (df_students['Credits'] >= 60)]

print(deans_list[['Name', 'GPA', 'Credits', 'Major']])

# --- Per-major summary statistics, rounded to two decimals --------------------
major_stats = (
    df_students.groupby("Major")
    .agg(
        {
            "GPA": ["mean", "max", "min"],
            "Credits": ["mean", "max", "min"],
            "Age": "mean",
        }
    )
    .round(2)
)


print(major_stats)

    Student_ID                 Name Date_of_Birth   GPA  Credits  \
0            1        Carl Williams    1997-12-18  2.51       50   
1            2    Elizabeth Johnson    2005-01-06  3.97      114   
2            3         Morgan Wells    1999-12-16  3.62      112   
3            4     Michael Anderson    2006-11-20  3.39       76   
4            5         Jerry Thomas    1997-11-18  3.35       31   
5            6      Brian Alexander    1996-11-09  3.39      114   
6            7        Cheryl Nelson    2001-08-08  2.17        7   
7            8       Anthony Becker    1998-11-06  2.28       16   
8            9        Matthew Lewis    2001-10-17  3.83       38   
9           10  Jacqueline Mitchell    2000-02-17  2.37       71   
10          11         Monica Baker    1997-02-06  3.04       96   
11          12         Julia Tucker    2005-12-07  2.38      101   
12          13       Mary Schneider    1997-06-30  2.75       86   
13          14        Jason Lambert    2001-02-2