In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the HR analytics export into a DataFrame and preview its first rows.
data_file = 'HR_Analytics.csv.csv'
input_file = pd.read_csv(filepath_or_buffer=data_file)
# head() defaults to five rows.
input_file.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Keep only the fields needed for the directory; copy so that later edits
# to `df` never write back into `input_file`.
columns = ['EmployeeNumber', 'JobRole', 'MonthlyIncome']
df = input_file.loc[:, columns].copy()
df

Unnamed: 0,EmployeeNumber,JobRole,MonthlyIncome
0,1,Sales Executive,5993
1,2,Research Scientist,5130
2,4,Laboratory Technician,2090
3,5,Research Scientist,2909
4,7,Laboratory Technician,3468
...,...,...,...
1465,2061,Laboratory Technician,2571
1466,2062,Healthcare Representative,9991
1467,2064,Manufacturing Director,6142
1468,2065,Sales Executive,5390


In [4]:
# Scale the monthly figure by 12 to get a yearly amount.
# NOTE: this overwrites the column from its own current values, so running
# the cell a second time multiplies by 12 again.
df['MonthlyIncome'] = df['MonthlyIncome'].mul(12)
df

Unnamed: 0,EmployeeNumber,JobRole,MonthlyIncome
0,1,Sales Executive,71916
1,2,Research Scientist,61560
2,4,Laboratory Technician,25080
3,5,Research Scientist,34908
4,7,Laboratory Technician,41616
...,...,...,...
1465,2061,Laboratory Technician,30852
1466,2062,Healthcare Representative,119892
1467,2064,Manufacturing Director,73704
1468,2065,Sales Executive,64680


In [5]:
# The column now holds the x12 value, so label it 'Salary'.
df = df.rename(columns={'MonthlyIncome': 'Salary'})
df

Unnamed: 0,EmployeeNumber,JobRole,Salary
0,1,Sales Executive,71916
1,2,Research Scientist,61560
2,4,Laboratory Technician,25080
3,5,Research Scientist,34908
4,7,Laboratory Technician,41616
...,...,...,...
1465,2061,Laboratory Technician,30852
1466,2062,Healthcare Representative,119892
1467,2064,Manufacturing Director,73704
1468,2065,Sales Executive,64680


In [6]:
!pip install faker
from faker import Faker
fake = Faker()
names = [fake.name() for _ in range(len(df))]
desired_format = 'FirstName LastName'
formatted_names = [f"{name.split(' ')[0]} {name.split(' ')[1]}" for name in names]

df.insert(0, 'Name', formatted_names)
df



Unnamed: 0,Name,EmployeeNumber,JobRole,Salary
0,David Glass,1,Sales Executive,71916
1,Aaron Gutierrez,2,Research Scientist,61560
2,Rachel Hardy,4,Laboratory Technician,25080
3,Matthew Smith,5,Research Scientist,34908
4,Terry Smith,7,Laboratory Technician,41616
...,...,...,...,...
1465,Tara Castillo,2061,Laboratory Technician,30852
1466,Cynthia Wright,2062,Healthcare Representative,119892
1467,Benjamin Yates,2064,Manufacturing Director,73704
1468,Antonio Guerrero,2065,Sales Executive,64680


In [7]:
# Generate one fake phone number per row, all in the same '(###)-###-####'
# layout and with no duplicates.
fake = Faker()
desired_count = len(df)
desired_format = '(###)-###-####'

# `phone_numbers` tracks what has already been issued so duplicates can be
# rejected; `formatted_phone_numbers` keeps the accepted values in generation
# order so they can be inserted as a column.
phone_numbers = set()
formatted_phone_numbers = []
while len(formatted_phone_numbers) < desired_count:
    # numerify() replaces every '#' in the template with a random digit.
    candidate = fake.numerify(desired_format)
    if candidate not in phone_numbers:
        phone_numbers.add(candidate)
        formatted_phone_numbers.append(candidate)

df.insert(2, 'PhoneNumber', formatted_phone_numbers)
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,Salary
0,David Glass,1,(028)-592-0251,Sales Executive,71916
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,61560
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,25080
3,Matthew Smith,5,(946)-756-6192,Research Scientist,34908
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,41616
...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,30852
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,119892
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,73704
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,64680


In [8]:
import random

# Pool of state codes an employee can be assigned to.
states = ['CT', 'MD', 'NY', 'MN', 'MA']

# Draw one state per row, independently and with replacement.
location_list = []
for _ in range(len(df)):
    location_list.append(random.choice(states))

# Insert the new column at position 4.
df.insert(loc=4, column='WorkLocation', value=location_list)
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary
0,David Glass,1,(028)-592-0251,Sales Executive,MA,71916
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,NY,61560
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,CT,25080
3,Matthew Smith,5,(946)-756-6192,Research Scientist,MD,34908
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,NY,41616
...,...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,MN,30852
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,MA,119892
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,CT,73704
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,NY,64680


In [9]:
# Username = first three letters of the first name + last name + employee number.
name_parts = df['Name'].str.split(' ')
first_three = name_parts.str[0].str[:3]
last_name = name_parts.str[-1]
employee_id = df['EmployeeNumber'].astype(str)
df['Username'] = first_three + last_name + employee_id
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username
0,David Glass,1,(028)-592-0251,Sales Executive,MA,71916,DavGlass1
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,NY,61560,AarGutierrez2
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,CT,25080,RacHardy4
3,Matthew Smith,5,(946)-756-6192,Research Scientist,MD,34908,MatSmith5
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,NY,41616,TerSmith7
...,...,...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,MN,30852,TarCastillo2061
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,MA,119892,CynWright2062
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,CT,73704,BenYates2064
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,NY,64680,AntGuerrero2065


In [10]:
# Every row receives the same starting password, stored as plain text.
default_password = 'Password123'
df.loc[:, 'Password'] = default_password
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username,Password
0,David Glass,1,(028)-592-0251,Sales Executive,MA,71916,DavGlass1,Password123
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,NY,61560,AarGutierrez2,Password123
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,CT,25080,RacHardy4,Password123
3,Matthew Smith,5,(946)-756-6192,Research Scientist,MD,34908,MatSmith5,Password123
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,NY,41616,TerSmith7,Password123
...,...,...,...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,MN,30852,TarCastillo2061,Password123
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,MA,119892,CynWright2062,Password123
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,CT,73704,BenYates2064,Password123
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,NY,64680,AntGuerrero2065,Password123


In [14]:
# Write the directory built so far to disk, keeping the row index as the first column.
df.to_csv(path_or_buf='Directory2.csv', index=True)

In [15]:
# Index labels of the rows whose JobRole is overwritten with 'HR'.
# The state code beside each label is the author's original note —
# presumably that row's WorkLocation in the run they worked from; since
# WorkLocation is drawn at random it may not match a fresh run (TODO confirm).
hr_row_labels = (
    27,   # NY
    43,   # CT
    67,   # MA
    85,   # MD
    131,  # MA
    151,  # MN
    156,  # CT
    212,  # NY
    304,  # MD
    434,  # MN
)
for row_label in hr_row_labels:
    df.at[row_label, 'JobRole'] = 'HR'

In [16]:
# Re-export so the file on disk reflects the current state of df (overwrites Directory2.csv).
df.to_csv(path_or_buf='Directory2.csv', index=True)

In [37]:
# NOTE: `managers_list` (with an "s") is not referenced again in the cells shown here.
managers_list = []

# Rows whose role is exactly 'Manager', and their employee numbers as a plain list.
is_manager = df['JobRole'].eq('Manager')
manager_list = df.loc[is_manager]
manager_ids = manager_list['EmployeeNumber'].tolist()

In [38]:
# Start every row with a genuinely missing manager. Using None (instead of the
# text 'NaN') means df['EmployeeManager'].isna() is True for unassigned rows,
# and to_csv writes an empty field for them rather than the literal string.
# The column is object-typed, so integer ids assigned later are stored as-is.
df['EmployeeManager'] = None
print(manager_ids)
df

[23, 32, 38, 84, 140, 148, 153, 158, 199, 226, 253, 259, 264, 298, 319, 323, 327, 329, 336, 363, 374, 376, 381, 410, 428, 429, 444, 473, 505, 531, 546, 549, 558, 568, 569, 582, 597, 613, 625, 644, 664, 671, 731, 734, 776, 787, 809, 820, 851, 905, 976, 981, 992, 1029, 1038, 1045, 1048, 1074, 1109, 1116, 1124, 1191, 1201, 1204, 1215, 1256, 1267, 1277, 1280, 1282, 1288, 1306, 1307, 1321, 1334, 1336, 1338, 1352, 1408, 1516, 1520, 1527, 1550, 1578, 1591, 1595, 1602, 1625, 1644, 1665, 1676, 1677, 1740, 1786, 1824, 1866, 1892, 1900, 1938, 1973, 2022, 2031]


Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username,Password,EmployeeManager
0,David Glass,1,(028)-592-0251,Sales Executive,MA,71916,DavGlass1,Password123,
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,NY,61560,AarGutierrez2,Password123,
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,CT,25080,RacHardy4,Password123,
3,Matthew Smith,5,(946)-756-6192,Research Scientist,MD,34908,MatSmith5,Password123,
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,NY,41616,TerSmith7,Password123,
...,...,...,...,...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,MN,30852,TarCastillo2061,Password123,
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,MA,119892,CynWright2062,Password123,
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,CT,73704,BenYates2064,Password123,
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,NY,64680,AntGuerrero2065,Password123,


In [44]:
# Give every non-manager a randomly chosen manager id; rows whose JobRole is
# 'Manager' are left untouched.
needs_manager = df['JobRole'] != 'Manager'

# One random.choice call per non-manager row, in row order.
assigned_ids = [
    random.choice(manager_ids) for _ in range(int(needs_manager.sum()))
]
df.loc[needs_manager, 'EmployeeManager'] = assigned_ids

df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username,Password,EmployeeManager
0,David Glass,1,(028)-592-0251,Sales Executive,MA,71916,DavGlass1,Password123,644
1,Aaron Gutierrez,2,(319)-223-8150,Research Scientist,NY,61560,AarGutierrez2,Password123,851
2,Rachel Hardy,4,(685)-228-8793,Laboratory Technician,CT,25080,RacHardy4,Password123,323
3,Matthew Smith,5,(946)-756-6192,Research Scientist,MD,34908,MatSmith5,Password123,1408
4,Terry Smith,7,(235)-620-1447,Laboratory Technician,NY,41616,TerSmith7,Password123,1973
...,...,...,...,...,...,...,...,...,...
1465,Tara Castillo,2061,(628)-785-5147,Laboratory Technician,MN,30852,TarCastillo2061,Password123,428
1466,Cynthia Wright,2062,(151)-385-8022,Healthcare Representative,MA,119892,CynWright2062,Password123,1516
1467,Benjamin Yates,2064,(390)-924-7020,Manufacturing Director,CT,73704,BenYates2064,Password123,981
1468,Antonio Guerrero,2065,(255)-301-3072,Sales Executive,NY,64680,AntGuerrero2065,Password123,1740


In [45]:
# Save the directory, now including the EmployeeManager column, over Directory2.csv.
df.to_csv(path_or_buf='Directory2.csv', index=True)

In [None]:
# !pip install pymongo
# from pymongo import MongoClient

In [None]:
# df2 = pd.read_csv('Directory.csv')
# mongo_upload = df2.to_dict(orient='records')

# client = MongoClient('mongodb://localhost:27017') #your mongo connection details
# db = client['databasename'] #actual db name
# collection = db['collectionname'] #actual collection name

# collection.insert_many(mongo_upload)

# print("Data inserted into MongoDB!")