<a href="https://colab.research.google.com/github/zidanseno/6pm/blob/main/TalentDataGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import numpy as np
import matplotlib as plt
import datetime
import gdown

In [None]:
# Preparing the talent data
#
# Download the talent roster CSV from Google Drive and load it into a DataFrame.

# Drive file id; named `file_id` so the builtin `id()` is not shadowed for
# the rest of the notebook.
file_id = "1gS9dyMi7aX5juzMItz68FcBgU4rd8x9s"
output = "talent_data.csv"
gdown.download(id=file_id, output=output, quiet=False)

# Read the same relative path that was just written instead of a hard-coded
# "/content/" prefix, so the cell does not depend on the Colab directory layout.
talent = pd.read_csv(output)
talent.head()

Downloading...
From: https://drive.google.com/uc?id=1gS9dyMi7aX5juzMItz68FcBgU4rd8x9s
To: /content/talent_data.csv
100%|██████████| 1.46k/1.46k [00:00<00:00, 3.97MB/s]


Unnamed: 0,Name,Type,Topic,Subtopic,Level
0,Kenneth Jo,Technical Writer,Technical Writer,Technical Writing,S2
1,Aditya Tanos Mandala,Technical Writer,Technical Writer,Documentation,S3
2,Dewi Sari Handayani,Technical Writer,Technical Writer,Documentation,S1
3,Gabriel Kheisa,ML,Time-series,LSTM,S3
4,Rikip Ginanjar,ML,Time-series,ARIMA,S1


In [None]:
# Defining the talent variables
# Technical writers are kept apart from the project types/topics, so build a
# mask for them once and reuse it.
writer_mask = talent['Type'] == 'Technical Writer'
project_talent = talent[~writer_mask]

# Distinct values, in order of first appearance in the talent table.
project_types = project_talent['Type'].unique().tolist()
ML_topics = talent.loc[talent['Type'] == 'ML', 'Topic'].unique().tolist()
levels = talent['Level'].unique().tolist()
writers = talent.loc[writer_mask, 'Name'].unique().tolist()

# Map every non-writer topic to the distinct subtopics listed under it.
subtopics = {
    topic_name: talent.loc[talent['Topic'] == topic_name, 'Subtopic'].unique().tolist()
    for topic_name in project_talent['Topic'].unique()
}

for summary in (project_types, ML_topics, subtopics, levels, writers):
  print(summary)

['ML', 'Front End', 'Back End']
['Time-series', 'Speech / Audio', 'NLP', 'Data Engineering', 'Computer Vision', 'Classification & Regression']
{'Time-series': ['LSTM', 'ARIMA'], 'Speech / Audio': ['Speech Recognition', 'Music Information Retrieval'], 'NLP': ['Topic Modeling', 'Sentiment Analysis'], 'Data Engineering': ['Data Warehousing'], 'Computer Vision': ['Object Detection'], 'Classification & Regression': ['Logistic Regression', 'Linear Regression'], 'Front End': ['React', 'Ember.js', 'Angular'], 'Back End': ['Node.js', 'Express.js', 'Django']}
['S2', 'S3', 'S1']
['Kenneth Jo', 'Aditya Tanos Mandala', 'Dewi Sari Handayani']


In [None]:
# Defining the difficulty generator function

def difficultyCounter(typeProj,month):
  """Draw a random integer difficulty score for a level code and a month number.

  The score is the sum of three parts:
    * a base picked at random from 1-3 when ``typeProj`` is 'S1', from 4-6
      when it is 'S2', and from 7-10 for any other value;
    * a fixed month offset: 6 for months 1 and 9, 4 for months 2 and 8,
      2 for months 3, 4, 6 and 7, 10 for month 10, and 5 for anything else;
    * a second random pick from 3-9 inclusive.

  The result therefore always lies between 6 and 29 inclusive.
  """
  base_choices = {'S1': [1,2,3], 'S2': [4,5,6]}
  month_offsets = {1: 6, 9: 6, 2: 4, 8: 4, 3: 2, 4: 2, 6: 2, 7: 2, 10: 10}

  # The base is drawn before the final pick, so the order of RNG calls is fixed.
  base = random.choice(base_choices.get(typeProj, [7,8,9,10]))
  offset = month_offsets.get(month, 5)
  extra = random.choice(range(3, 10))

  return base + offset + extra

In [None]:
# Getting sampling weights (row counts per category in the talent data)

def get_weight(z,c):
  """Count the rows of ``talent`` matching each value in ``z``.

  For every element of the sequence ``z``, counts how many rows of the
  module-level ``talent`` DataFrame have that value in column ``c``.

  Returns a list of ints, in the same order as ``z``.
  """
  return [int((talent[c] == value).sum()) for value in z]

In [None]:
# Creating the past project data

# Seed the RNG before the first random draw so that Restart & Run All
# regenerates the same dummy dataset every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Per-project columns; the generation loop appends one entry per project.
pIDs = []
ptypes = []
topics = []
psubtopics = []
diffs = []
workers = []
techwriters = []

# Date window and number of projects to generate.
start_date = datetime.date(2022, 1, 1)
end_date = datetime.date(2023, 12, 31)
num_dates = 1000

# Draw num_dates days (repeats allowed) between start_date and end_date
# inclusive, then put them in chronological order.
span_days = (end_date - start_date).days
random_dates = sorted(
    start_date + datetime.timedelta(days=random.randint(0, span_days))
    for _ in range(num_dates)
)

In [None]:
# Generate one synthetic past project per date in `random_dates`.

# Empty the accumulators first so re-running this cell does not append a
# second batch of projects on top of the first.
for column in (pIDs, ptypes, topics, psubtopics, diffs, workers, techwriters):
    column.clear()

# The sampling weights depend only on `talent`, so compute them once here
# rather than on every iteration.
type_weights = get_weight(project_types, 'Type')
ml_topic_weights = get_weight(ML_topics, 'Topic')
subtopic_weights = {name: get_weight(options, 'Subtopic')
                    for name, options in subtopics.items()}

for i, date in enumerate(random_dates):
    # Project ID: project date as ddmmyy followed by a 4-digit running number
    pID = f'{date.strftime("%d%m%y")}{i+1:04d}'
    pIDs.append(pID)

    # Project Type
    ptype = random.choices(project_types, weights=type_weights)[0]
    ptypes.append(ptype)

    # Project Topic: only ML projects draw a separate topic; for the other
    # types the topic is the type itself.
    if ptype == 'ML':
        topic = random.choices(ML_topics, weights=ml_topic_weights)[0]
    else:
        topic = ptype
    topics.append(topic)

    # Project Subtopic
    subtopic = random.choices(subtopics[topic],
                              weights=subtopic_weights[topic])[0]
    psubtopics.append(subtopic)

    # Project Difficulty
    # NOTE(review): the month passed here is a random 1-10 draw, not
    # date.month -- confirm that this is intended.
    level = random.choice(levels)
    diff = difficultyCounter(level, random.choice(range(1,10+1)))
    diffs.append(diff)

    # Workers
    # Amount of workers: 2 below difficulty 10, 4 for 10-14, 5 from 15 up.
    # Values outside the original 5-29 bands fall into the nearest band
    # instead of silently reusing the previous project's headcount.
    if diff < 10:
        k = 2
    elif diff < 15:
        k = 4
    else:
        k = 5

    # Priority tiers, best match first:
    #   1. same type, topic, subtopic and level
    #   2. same type, topic and subtopic
    #   3. same type and topic
    #   4. same type
    type_match = talent['Type'] == ptype
    topic_match = type_match & (talent['Topic'] == topic)
    subtopic_match = topic_match & (talent['Subtopic'] == subtopic)
    level_match = subtopic_match & (talent['Level'] == level)

    # Each name is placed only in the highest tier it qualifies for, and only
    # once, so a person can never be assigned twice to the same project.
    tiers = []
    taken = set()
    for mask in (level_match, subtopic_match, topic_match, type_match):
        names = [name for name in dict.fromkeys(talent.loc[mask, 'Name'])
                 if name not in taken]
        taken.update(names)
        tiers.append(names)

    # Fill the k seats tier by tier: take a whole tier while it fits, then
    # pick the remaining seats without replacement and stop.
    ppl = []
    remaining = k
    for tier in tiers:
        if remaining == 0:
            break
        if remaining >= len(tier):
            ppl += tier
            remaining -= len(tier)
        else:
            # random.sample never repeats a name; zeroing `remaining` stops
            # lower-priority tiers from adding workers beyond the headcount.
            ppl += random.sample(tier, remaining)
            remaining = 0

    workers.append(ppl)

    # Technical Writer
    #techwriters.append(random.choices(writers))

In [None]:
# Convert to dataframe
# Assemble the generated columns into a single table, one row per project.

data = pd.DataFrame({
    'Project Type': ptypes,
    'Topics': topics,
    'Sub Topic': psubtopics,
    'Difficulty': diffs,
    'Workers': workers,
    #'Tech Writer': techwriters,
})

data.head()

Unnamed: 0,Project Type,Topics,Sub Topic,Difficulty,Workers
0,Front End,Front End,Ember.js,9,"[Iga Narendra Pramawijaya, Muhammad Raden Syaw..."
1,ML,Speech / Audio,Speech Recognition,8,"[I Putu Ranantha Nugraha Suparta, Putu Gede Ag..."
2,ML,NLP,Sentiment Analysis,23,"[Alvin Tan, Bagja Kurniadi, Sarah Sema Khairun..."
3,ML,NLP,Sentiment Analysis,19,"[Bagja Kurniadi, Alvin Tan, Sarah Sema Khairun..."
4,Front End,Front End,Angular,7,"[Andi Rezal Oktavianto, Imam]"


In [None]:
# Convert to csv for workers
# Grouped version: one row per project, with the worker list in a single cell.
grouped_csv = 'datadummy_new_grouped.csv'
data.to_csv(grouped_csv)

# Make and convert the exploded version csv
# Exploded version: one row per (project, worker) pair, renumbered from 0.
exploded_csv = 'datadummy_new_exploded.csv'
data_exploded = data.explode('Workers', ignore_index=True)
data_exploded.to_csv(exploded_csv)