In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the HR analytics CSV into a DataFrame and preview its first five rows.
data_file = 'HR_Analytics.csv.csv'
input_file = pd.read_csv(filepath_or_buffer=data_file)
input_file.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Keep only the three columns needed downstream. The .copy() detaches the
# subset from input_file, so later edits to df leave the loaded frame untouched.
columns = ['EmployeeNumber', 'JobRole', 'MonthlyIncome']
df = input_file.loc[:, columns].copy()
df

Unnamed: 0,EmployeeNumber,JobRole,MonthlyIncome
0,1,Sales Executive,5993
1,2,Research Scientist,5130
2,4,Laboratory Technician,2090
3,5,Research Scientist,2909
4,7,Laboratory Technician,3468
...,...,...,...
1465,2061,Laboratory Technician,2571
1466,2062,Healthcare Representative,9991
1467,2064,Manufacturing Director,6142
1468,2065,Sales Executive,5390


In [4]:
# Multiply the monthly figure by 12. The column is overwritten from its own
# values, so executing this cell a second time multiplies by 12 again.
df['MonthlyIncome'] = df['MonthlyIncome'].mul(12)
df

Unnamed: 0,EmployeeNumber,JobRole,MonthlyIncome
0,1,Sales Executive,71916
1,2,Research Scientist,61560
2,4,Laboratory Technician,25080
3,5,Research Scientist,34908
4,7,Laboratory Technician,41616
...,...,...,...
1465,2061,Laboratory Technician,30852
1466,2062,Healthcare Representative,119892
1467,2064,Manufacturing Director,73704
1468,2065,Sales Executive,64680


In [5]:
# Relabel the income column as 'Salary' by rebinding df to the renamed frame.
df = df.rename(columns={'MonthlyIncome': 'Salary'})
df

Unnamed: 0,EmployeeNumber,JobRole,Salary
0,1,Sales Executive,71916
1,2,Research Scientist,61560
2,4,Laboratory Technician,25080
3,5,Research Scientist,34908
4,7,Laboratory Technician,41616
...,...,...,...
1465,2061,Laboratory Technician,30852
1466,2062,Healthcare Representative,119892
1467,2064,Manufacturing Director,73704
1468,2065,Sales Executive,64680


In [6]:
!pip install faker
from faker import Faker
fake = Faker()
names = [fake.name() for _ in range(len(df))]
desired_format = 'FirstName LastName'
formatted_names = [f"{name.split(' ')[0]} {name.split(' ')[1]}" for name in names]

df.insert(0, 'Name', formatted_names)
df



Unnamed: 0,Name,EmployeeNumber,JobRole,Salary
0,Jennifer Boyd,1,Sales Executive,71916
1,Heather Smith,2,Research Scientist,61560
2,Beth Walker,4,Laboratory Technician,25080
3,Kelly Padilla,5,Research Scientist,34908
4,Angel Moore,7,Laboratory Technician,41616
...,...,...,...,...
1465,David Gonzalez,2061,Laboratory Technician,30852
1466,Kimberly Kelly,2062,Healthcare Representative,119892
1467,Cory Pugh,2064,Manufacturing Director,73704
1468,Autumn Roy,2065,Sales Executive,64680


In [7]:
fake = Faker()
desired_count = len(df)
desired_format = '(###)-###-####'

# Generate one phone number per row in the target format, guaranteeing that no
# two rows share a number. `phone_numbers` tracks what has already been issued;
# `formatted_phone_numbers` keeps the numbers in generation order for insertion.
# (Previously a unique set of fake.phone_number() values was built and then
# discarded, while the numbers actually stored were never checked for
# duplicates.)
phone_numbers = set()
formatted_phone_numbers = []
while len(formatted_phone_numbers) < desired_count:
    candidate = fake.numerify(desired_format)  # each '#' becomes a random digit
    if candidate in phone_numbers:
        continue  # collision: draw again
    phone_numbers.add(candidate)
    formatted_phone_numbers.append(candidate)

# Place the numbers at column position 2.
df.insert(2, 'PhoneNumber', formatted_phone_numbers)
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,Salary
0,Jennifer Boyd,1,(578)-285-8371,Sales Executive,71916
1,Heather Smith,2,(571)-982-0381,Research Scientist,61560
2,Beth Walker,4,(022)-091-9120,Laboratory Technician,25080
3,Kelly Padilla,5,(132)-530-4546,Research Scientist,34908
4,Angel Moore,7,(411)-108-8099,Laboratory Technician,41616
...,...,...,...,...,...
1465,David Gonzalez,2061,(285)-108-6327,Laboratory Technician,30852
1466,Kimberly Kelly,2062,(406)-242-6488,Healthcare Representative,119892
1467,Cory Pugh,2064,(795)-737-5063,Manufacturing Director,73704
1468,Autumn Roy,2065,(474)-301-8082,Sales Executive,64680


In [8]:
import random

states = ['CT', 'MD', 'NY', 'MN', 'MA']

# Pick one state per row at random. No seed is set, so the assignment differs
# from run to run.
location_list = []
for _ in range(len(df)):
    location_list.append(random.choice(states))

# Place the locations at column position 4.
df.insert(4, 'WorkLocation', location_list)
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary
0,Jennifer Boyd,1,(578)-285-8371,Sales Executive,MN,71916
1,Heather Smith,2,(571)-982-0381,Research Scientist,MD,61560
2,Beth Walker,4,(022)-091-9120,Laboratory Technician,CT,25080
3,Kelly Padilla,5,(132)-530-4546,Research Scientist,MD,34908
4,Angel Moore,7,(411)-108-8099,Laboratory Technician,MD,41616
...,...,...,...,...,...,...
1465,David Gonzalez,2061,(285)-108-6327,Laboratory Technician,CT,30852
1466,Kimberly Kelly,2062,(406)-242-6488,Healthcare Representative,MD,119892
1467,Cory Pugh,2064,(795)-737-5063,Manufacturing Director,MN,73704
1468,Autumn Roy,2065,(474)-301-8082,Sales Executive,MA,64680


In [9]:
# Username = first three characters of the first name token
#          + the last name token
#          + the employee number as text.
name_parts = df['Name'].str.split(' ')
first_prefix = name_parts.str[0].str[:3]
last_token = name_parts.str[-1]
employee_id_text = df['EmployeeNumber'].astype(str)

df['Username'] = first_prefix + last_token + employee_id_text
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username
0,Jennifer Boyd,1,(578)-285-8371,Sales Executive,MN,71916,JenBoyd1
1,Heather Smith,2,(571)-982-0381,Research Scientist,MD,61560,HeaSmith2
2,Beth Walker,4,(022)-091-9120,Laboratory Technician,CT,25080,BetWalker4
3,Kelly Padilla,5,(132)-530-4546,Research Scientist,MD,34908,KelPadilla5
4,Angel Moore,7,(411)-108-8099,Laboratory Technician,MD,41616,AngMoore7
...,...,...,...,...,...,...,...
1465,David Gonzalez,2061,(285)-108-6327,Laboratory Technician,CT,30852,DavGonzalez2061
1466,Kimberly Kelly,2062,(406)-242-6488,Healthcare Representative,MD,119892,KimKelly2062
1467,Cory Pugh,2064,(795)-737-5063,Manufacturing Director,MN,73704,CorPugh2064
1468,Autumn Roy,2065,(474)-301-8082,Sales Executive,MA,64680,AutRoy2065


In [10]:
# Give every row the same placeholder password.
# NOTE(review): the value is stored in plain text and ends up in the exported
# Directory.csv -- confirm this is acceptable for the intended use.
default_password = 'Password123'
df = df.assign(Password=default_password)
df

Unnamed: 0,Name,EmployeeNumber,PhoneNumber,JobRole,WorkLocation,Salary,Username,Password
0,Jennifer Boyd,1,(578)-285-8371,Sales Executive,MN,71916,JenBoyd1,Password123
1,Heather Smith,2,(571)-982-0381,Research Scientist,MD,61560,HeaSmith2,Password123
2,Beth Walker,4,(022)-091-9120,Laboratory Technician,CT,25080,BetWalker4,Password123
3,Kelly Padilla,5,(132)-530-4546,Research Scientist,MD,34908,KelPadilla5,Password123
4,Angel Moore,7,(411)-108-8099,Laboratory Technician,MD,41616,AngMoore7,Password123
...,...,...,...,...,...,...,...,...
1465,David Gonzalez,2061,(285)-108-6327,Laboratory Technician,CT,30852,DavGonzalez2061,Password123
1466,Kimberly Kelly,2062,(406)-242-6488,Healthcare Representative,MD,119892,KimKelly2062,Password123
1467,Cory Pugh,2064,(795)-737-5063,Manufacturing Director,MN,73704,CorPugh2064,Password123
1468,Autumn Roy,2065,(474)-301-8082,Sales Executive,MA,64680,AutRoy2065,Password123


In [11]:
# Write the assembled directory to disk; index=False leaves out the row index.
df.to_csv(path_or_buf='Directory.csv', index=False)

In [12]:
# !pip install pymongo
# from pymongo import MongoClient

In [13]:
# df2 = pd.read_csv('Directory.csv')
# mongo_upload = df2.to_dict(orient='records')

# client = MongoClient('mongodb://localhost:27017') #your mongo connection details
# db = client['databasename'] #actual db name
# collection = db['collectionname'] #actual collection name

# collection.insert_many(mongo_upload)

# print("Data inserted into MongoDB!")