In [42]:
from faker import Faker
import pandas as pd
import numpy as np
import random

# Shared Faker instance; generate_employee_data uses it to produce names,
# cities, joining dates, emails, phone numbers, and job titles.
fake = Faker()

def generate_employee_data(n_rows, seed=None):
    """Generate a DataFrame of synthetic employee records.

    Parameters
    ----------
    n_rows : int
        Number of employee rows to generate.
    seed : int, optional
        When provided, seeds both the ``random`` module and the module-level
        ``fake`` Faker instance before generating, so repeated calls with the
        same seed produce the same data. The default (``None``) leaves both
        generators untouched, which is the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per employee with the columns ``Employee_ID`` (1..n_rows),
        ``Name``, ``Age`` (integer in 22..65), ``Salary`` (35000..150000,
        rounded to 2 decimals), ``City``, ``Joining_Date`` (within the last
        ten years), ``Email``, ``Phone``, ``Department`` (one of Sales, IT,
        Marketing, HR, Finance) and ``Job_Title``.
    """
    if seed is not None:
        # Seed both randomness sources: the stdlib generator drives Age,
        # Salary and Department; the Faker instance drives everything else.
        random.seed(seed)
        fake.seed_instance(seed)

    data = {
        "Employee_ID": range(1, n_rows + 1),
        "Name": [fake.name() for _ in range(n_rows)],
        "Age": [random.randint(22, 65) for _ in range(n_rows)],
        "Salary": [round(random.uniform(35000, 150000), 2) for _ in range(n_rows)],
        "City": [fake.city() for _ in range(n_rows)],
        "Joining_Date": [fake.date_between(start_date='-10y', end_date='today') for _ in range(n_rows)],
        "Email": [fake.email() for _ in range(n_rows)],
        "Phone": [fake.phone_number() for _ in range(n_rows)],
        "Department": [random.choice(['Sales', 'IT', 'Marketing', 'HR', 'Finance']) for _ in range(n_rows)],
        "Job_Title": [fake.job() for _ in range(n_rows)]
    }
    return pd.DataFrame(data)

# Build the 100-row employee table, preview the first ten rows, and report its size
df = generate_employee_data(n_rows=100)
print(df.head(n=10))
print(f"\nDataFrame shape: {df.shape}")

# Write the table to CSV, leaving out the index column
df.to_csv(path_or_buf='employees_data.csv', index=False)

   Employee_ID              Name  Age     Salary               City  \
0            1        Anna Parks   57   63587.22         Amyborough   
1            2       Julie Rojas   39   77946.16        South Megan   
2            3   Nicholas Martin   25  122107.01  East Clintonmouth   
3            4       Alexis Mays   40   50341.43      Frederickfurt   
4            5    Michael Holmes   61  138740.82         Lake Ruben   
5            6      Jamie Rhodes   41   52399.59     Port Kathyland   
6            7  Cristina Trevino   54   93630.84      Lake Keithton   
7            8   Kathy Fernandez   59  141332.58        Robertsside   
8            9       Cody Fisher   54  148477.59           Huntside   
9           10   Patricia Murray   24   90442.57          Larryberg   

  Joining_Date                         Email                 Phone Department  \
0   2024-11-24           jwilson@example.org    238-430-4520x05993    Finance   
1   2018-09-04           jason18@example.net      621.68

## Task 1


In [43]:
# Keep employees aged 25 to 50 (both ends inclusive) who earn more than 60000
# and work in IT or Finance.
age_in_range = df['Age'].between(25, 50)
salary_above_min = df['Salary'].gt(60000)
dept_selected = df['Department'].isin(['IT', 'Finance'])

filtered_df = df[age_in_range & salary_above_min & dept_selected]
print("\nFiltered DataFrame:")
print(filtered_df)



Filtered DataFrame:
    Employee_ID                Name  Age     Salary               City  \
1             2         Julie Rojas   39   77946.16        South Megan   
16           17          Melinda Wu   45   78251.34         Tuckerland   
21           22  Christopher Norris   28   95052.33   Christensenhaven   
26           27      Mercedes Ramos   29   91550.12     Elizabethburgh   
28           29         Ellen Bowen   29   95049.99     Lake Kyliefort   
38           39       Joel Martinez   27  104874.11           Coleview   
53           54  Richard Villarreal   31   66767.41   Port Ashleymouth   
54           55        Randy Walton   50   90167.49         Nancyville   
56           57       Laurie Torres   44  108520.03        Vargasville   
71           72         Amanda Hale   35   60125.70   Elizabethborough   
77           78      Ryan Middleton   34  145993.02          Brianland   
81           82        Bryan Warren   32  134088.54  West Kristinburgh   
83           84  

## Task 2

In [44]:
# Label-based selection: rows labelled 2 through 4 (end label included)
# and three named columns.
loc_columns = ['Name', 'Age', 'Salary']
selected_df_loc = df.loc[2:4, loc_columns]
print("\nSelected DataFrame using loc:")
print(selected_df_loc)

# Position-based selection: first three rows and first two columns
# (end positions excluded).
selected_df_iloc = df.iloc[:3, :2]
print("\nSelected DataFrame using iloc:")
print(selected_df_iloc)


Selected DataFrame using loc:
              Name  Age     Salary
2  Nicholas Martin   25  122107.01
3      Alexis Mays   40   50341.43
4   Michael Holmes   61  138740.82

Selected DataFrame using iloc:
   Employee_ID             Name
0            1       Anna Parks
1            2      Julie Rojas
2            3  Nicholas Martin


## Task 3

In [45]:
# Inject missing values into randomly chosen rows so the cleaning steps below
# have something to work on. random.sample returns row *positions*, while .loc
# is label-based, so the positions are mapped through df.index first. The two
# coincide only while the index is exactly 0..n-1; after the in-place dropna
# further down leaves gaps, a re-run of this cell would otherwise target wrong
# or non-existent labels.
salary_rows = df.index[random.sample(range(len(df)), 5)]
df.loc[salary_rows, "Salary"] = np.nan
department_rows = df.index[random.sample(range(len(df)), 3)]
df.loc[department_rows, "Department"] = np.nan
age_rows = df.index[random.sample(range(len(df)), 4)]
df.loc[age_rows, "Age"] = np.nan

# Count missing values per column
missing_counts = df.isnull().sum()
print(missing_counts)

print("\nDataFrame with missing values:")
print(df)

# Salary: fill gaps with the mean of the remaining salaries
mean_salary = df['Salary'].mean()
df['Salary'] = df['Salary'].fillna(mean_salary)

# Department: mark missing entries with an explicit placeholder
df['Department'] = df['Department'].fillna('Unknown')

# Age: drop the rows where it is missing (mutates df in place, so the index
# no longer runs contiguously from 0 afterwards)
df.dropna(subset=['Age'], inplace=True)

# Drop any row that still has a missing value in any column
df_with_no_missing_values = df.dropna()
print("\nDataFrame after handling missing values:")
print(df_with_no_missing_values)

Employee_ID     0
Name            0
Age             4
Salary          5
City            0
Joining_Date    0
Email           0
Phone           0
Department      3
Job_Title       0
dtype: int64

DataFrame with missing values:
    Employee_ID               Name   Age     Salary               City  \
0             1         Anna Parks  57.0   63587.22         Amyborough   
1             2        Julie Rojas  39.0   77946.16        South Megan   
2             3    Nicholas Martin  25.0  122107.01  East Clintonmouth   
3             4        Alexis Mays  40.0   50341.43      Frederickfurt   
4             5     Michael Holmes  61.0        NaN         Lake Ruben   
..          ...                ...   ...        ...                ...   
95           96  Matthew Schneider  27.0   82011.77   West Richardfurt   
96           97        Kayla Ellis   NaN   36219.86         Cherylberg   
97           98       Shelby Mejia  35.0  140007.44         Alyssaberg   
98           99       Isaiah Cooke 