In [2]:
import re

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from utils import load_data
import config as cfg

In [3]:
users = load_data(format='pandas')

In [4]:
users = users.drop_duplicates()

In [5]:
users.columns

Index(['login', 'id', 'node_id', 'avatar_url', 'gravatar_id', 'url',
       'html_url', 'followers_url', 'following_url', 'gists_url',
       'starred_url', 'subscriptions_url', 'organizations_url', 'repos_url',
       'events_url', 'received_events_url', 'type', 'site_admin', 'name',
       'company', 'blog', 'location', 'email', 'hireable', 'bio',
       'twitter_username', 'public_repos', 'public_gists', 'followers',
       'following', 'created_at', 'updated_at'],
      dtype='object')

In [6]:
users.location.value_counts().index

Index(['Beijing', 'China', 'Beijing, China', 'Shanghai', 'San Francisco, CA',
       'London', 'India', 'Shanghai, China', 'Berlin, Germany', 'Singapore',
       ...
       'London, UK / Italy', 'Anápolis - Goiás, Brasil', 'Shanghai Huangpu',
       'Seville, Andalusia (Spain)',
       'nohup ./bin/norfair00 brain /data > /dev/null 2>&1 &',
       'Hong Kong; Shenzhen, China', 'Taubaté, SP - Brasil', 'Rangkasbitung',
       'Mannheim, Baden-Württemberg, Germany', 'Connecticut'],
      dtype='object', length=9758)

In [7]:
cleaner = lambda text: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", text)

In [8]:
bios = users.set_index('login')['bio']
bios = bios.str.lower()
bios = bios.dropna()
print(bios.iloc[572])
bios = bios.apply(cleaner)
print(bios.iloc[572])
bios = bios.str.split().apply(' '.join)
print(bios.iloc[572])

professor at ifnmg, ph.d. in computational intelligence,  data science and machine intelligence enthusiast at minds/ufmg, researcher at @cidic-ifnmg 
professor at ifnmg  ph d  in computational intelligence   data science and machine intelligence enthusiast at minds ufmg  researcher at  cidic ifnmg 
professor at ifnmg ph d in computational intelligence data science and machine intelligence enthusiast at minds ufmg researcher at cidic ifnmg


In [9]:
bios

login
juankuquintana                       backend engineer at the kernel
josete89                     swift machine learning java javascript
stevejhkang                              interested in cloud spring
andru255                                                  developer
nylqd                                   done is better than perfect
                                        ...                        
kmanley                                                     loading
piraka9011                              roboticist i talk to robots
jurabek           software engineer mostly developing using open...
jikkujose         simulation hypothesis mindfulness decentraliza...
mouadziani        full stack software engineer working with lara...
Name: bio, Length: 33262, dtype: object

In [10]:
def check_occurences(texts, text):
    return texts.str.contains(text).sum()

In [11]:
c = CountVectorizer(max_features=200, ngram_range=(2,2), stop_words='english')

In [12]:
c.fit(bios)

In [13]:
c.get_feature_names_out()

array(['10 years', 'ai ml', 'android developer', 'android ios',
       'app developer', 'application developer',
       'artificial intelligence', 'assistant professor',
       'associate professor', 'backend developer', 'backend engineer',
       'big data', 'build things', 'building things', 'cloud computing',
       'cloud native', 'computer engineer', 'computer engineering',
       'computer graphics', 'computer science', 'computer scientist',
       'computer vision', 'core team', 'cs phd', 'cto founder',
       'currently working', 'data analysis', 'data engineer',
       'data engineering', 'data mining', 'data science',
       'data scientist', 'data visualization', 'deep learning',
       'design systems', 'designer developer', 'developer advocate',
       'developer designer', 'developer expert', 'developer founder',
       'developer javascript', 'developer love', 'developer loves',
       'developer open', 'developer php', 'developer react',
       'developer software', 'de

In [14]:
c.vocabulary_

{'backend engineer': 10,
 'machine learning': 103,
 'software engineer': 156,
 'computer engineer': 16,
 'frontend developer': 66,
 'developer open': 43,
 'open source': 113,
 'end developer': 52,
 'senior software': 149,
 'software developer': 154,
 'lead developer': 92,
 'stack web': 169,
 'web developer': 192,
 'ruby rails': 143,
 'react js': 134,
 'phd student': 118,
 'deep learning': 33,
 'computer vision': 21,
 'python developer': 131,
 'associate professor': 8,
 'data mining': 29,
 'big data': 11,
 'javascript developer': 81,
 'developer javascript': 40,
 'javascript enthusiast': 82,
 'application developer': 5,
 '10 years': 0,
 'end engineer': 54,
 'data engineer': 27,
 'programming languages': 130,
 'web development': 193,
 'cto founder': 24,
 'stack dev': 164,
 'master student': 105,
 'founder cto': 65,
 'computer scientist': 20,
 'computer engineering': 17,
 'write code': 197,
 'product engineer': 128,
 'learning researcher': 101,
 'data scientist': 31,
 'team lead': 178,
 '

In [35]:
x = (
    check_occurences(bios, 'web developer'),
    check_occurences(bios, 'backend'),
    check_occurences(bios, 'front end'),
    check_occurences(bios, 'frontend'),
    check_occurences(bios, 'backend'),
    check_occurences(bios, 'php'),
    check_occurences(bios, 'nodejs'),
    check_occurences(bios, 'vuejs'),
    check_occurences(bios, 'django'),
)
sum(x), x

(4240, (827, 373, 807, 571, 373, 703, 370, 114, 102))

In [16]:
x = (
    check_occurences(bios, 'ml'),
    check_occurences(bios, 'ai'),
    check_occurences(bios, 'machine learning'),
    check_occurences(bios, 'artificial intelligence'),
    check_occurences(bios, 'computer vision'),
    check_occurences(bios, 'natural language processing'),
    check_occurences(bios, 'nlp'),
    check_occurences(bios, 'deep learning'),
    check_occurences(bios, 'deeplearning'),
    check_occurences(bios, 'machinelearning'),    
)
sum(x), x

(5771, (683, 2812, 964, 111, 353, 73, 300, 445, 22, 8))

In [17]:
x = (
    check_occurences(bios, 'mobile'),
    check_occurences(bios, 'ios'),
    check_occurences(bios, 'android'),
    check_occurences(bios, 'swift'),
    check_occurences(bios, 'flutter'),
    
)
sum(x), x

(1778, (387, 564, 504, 165, 158))

In [18]:
x = (
    check_occurences(bios, 'data scientist'),
)
sum(x), x

(478, (478,))

In [19]:
x = (
    check_occurences(bios, 'data engineer'),
    check_occurences(bios, 'dataengineer'),
)
sum(x), x

(156, (154, 2))

In [20]:
(
    check_occurences(bios, 'software engineer'),
)

(3048,)

In [21]:
(
    check_occurences(bios, 'full stack'),
    check_occurences(bios, 'fullstack'),
)

(1329, 332)

In [22]:
(
    check_occurences(bios, 'ui ux'),
    check_occurences(bios, 'ux designer'),
)

(81, 54)

In [34]:
bios[bios.str.contains('nodejs')]

login
mmarchini                                bpf and stuff nodejs tsc
cnviradiya      opensource lover nodejs vue storefront vue js ...
GitHubJiKe          fullstack developer based on react vue nodejs
LukeLin         senior frontend nodejs developer pre alloyteam...
danillo10       fullstack developer html5 css3 javascript boos...
                                      ...                        
chandankuiry    i am python django developer nodejs enthusiast...
geun                   startup developer nodejs react reactnative
ericzon         computer engineer turned into fullstack dev in...
daonhan         net senior developer loves javascript and web ...
leux9                                         nodejs engineer ibm
Name: bio, Length: 370, dtype: object

In [1]:
classes = [
    'Web Developer', 
    'Mobile Developer', 
    'Machine Learning Engineer', 
    'Data Engineer', 
    'Data Scientist',
    'UI/UX Developer'
]

In [2]:
', '.join(classes)

'Web Developer, Mobile Developer, Machine Learning Engineer, Data Engineer, Data Scientist, UI/UX Developer'

In [37]:
classes_expr = {
    classes[0]: ['web developer', 'backend', 'front end', 'frontend', 'backend', 'php', 'nodejs', 'vuejs', 'django'],
    classes[1]: ['ios', 'android', 'swift', 'flutter'],
    classes[2]: ['ai', 'machine learning', 'artificial intelligence', 'computer vision', 'natural language processing', 'nlp', 'deep learning', 'deeplearning', 'machinelearning'],
    classes[3]: ['data engineer', 'dataengineer'],
    classes[4]: ['data scientist'],
    classes[5]: ['ui ux', 'ux designer']
}

In [38]:
labels = pd.DataFrame(index=bios.index)
for class_, exprs in classes_expr.items():
    x = pd.Series(0, index=bios.index)
    for e in exprs:
        x = x + bios.str.contains(e).astype(int)
    
    x = (x > 0 ).astype(int)
    labels[class_] = x

In [39]:
labels.sum(axis=0)

Web Developer                3449
Mobile Developer             1186
Machine Learning Engineer    4357
Data Engineer                 156
Data Scientist                478
UI/UX Developer               112
dtype: int64

In [49]:
labels

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
juankuquintana,1,0,0,0,0,0
josete89,0,1,1,0,0,0
stevejhkang,0,0,0,0,0,0
andru255,0,0,0,0,0,0
nylqd,0,0,0,0,0,0
...,...,...,...,...,...,...
kmanley,0,0,0,0,0,0
piraka9011,0,0,0,0,0,0
jurabek,0,0,0,0,0,0
jikkujose,0,0,1,0,0,0


In [41]:
bios

login
juankuquintana                       backend engineer at the kernel
josete89                     swift machine learning java javascript
stevejhkang                              interested in cloud spring
andru255                                                  developer
nylqd                                   done is better than perfect
                                        ...                        
kmanley                                                     loading
piraka9011                              roboticist i talk to robots
jurabek           software engineer mostly developing using open...
jikkujose         simulation hypothesis mindfulness decentraliza...
mouadziani        full stack software engineer working with lara...
Name: bio, Length: 33262, dtype: object

In [42]:
labels.merge(bios, left_index=True, right_index=True)

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,bio
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
007arunwilson,1,0,0,0,0,0,exploring js nodejs dev former php dev react n...
007jedgar,0,0,0,0,0,0,mobile web app dev
00Kai0,0,0,0,0,0,0,coding for fun currently working on game party...
00imvj00,0,0,0,0,0,0,engineer
0101011,0,0,0,0,0,0,software engineer
...,...,...,...,...,...,...,...
yysu,1,0,1,0,0,0,aws certified all 5 familiar with python conta...
yysu,1,0,1,0,0,0,aws certified all 5 familiar with python conta...
zhouzi,0,0,0,0,0,0,software engineer with a passion for user expe...
zhukovgreen,0,0,0,1,0,0,software and data engineer


In [50]:
labeled_users = labels[labels.sum(axis=1) > 0]

In [51]:
labeled_users

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
juankuquintana,1,0,0,0,0,0
josete89,0,1,1,0,0,0
addappcn,0,1,0,0,0,0
mavisland,1,0,0,0,0,0
babakasse,0,0,1,0,0,0
...,...,...,...,...,...,...
fredwu,0,0,1,0,0,0
danielpetrica,1,0,1,0,0,0
gulullu,1,0,0,0,0,0
jikkujose,0,0,1,0,0,0


In [52]:
labeled_users.sum(axis=0)

Web Developer                3449
Mobile Developer             1186
Machine Learning Engineer    4357
Data Engineer                 156
Data Scientist                478
UI/UX Developer               112
dtype: int64

In [46]:
labeled_users

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
juankuquintana,1,0,0,0,0,0
josete89,0,1,1,0,0,0
addappcn,0,1,0,0,0,0
mavisland,1,0,0,0,0,0
babakasse,0,0,1,0,0,0
...,...,...,...,...,...,...
fredwu,0,0,1,0,0,0
danielpetrica,1,0,1,0,0,0
gulullu,1,0,0,0,0,0
jikkujose,0,0,1,0,0,0


In [53]:
labeled_users = labeled_users[~labeled_users.index.duplicated(keep='first')]

In [54]:
labeled_users.to_csv(cfg.USER_LABELS_FILE)

In [55]:
label_counts = labeled_users.sum(axis=1)
label_counts[label_counts == 1]

login
juankuquintana    1
addappcn          1
mavisland         1
babakasse         1
jlcallalle        1
                 ..
chenshun00        1
tak1n             1
fredwu            1
gulullu           1
mouadziani        1
Length: 8037, dtype: int64

In [56]:
label_counts[label_counts == 2]

login
josete89         2
justmeshishir    2
iwitaly          2
mraible          2
Concert0         2
                ..
flyingant        2
mhink            2
mikevocalz       2
jiangzhongkai    2
danielpetrica    2
Length: 794, dtype: int64

In [57]:
label_counts[label_counts == 3]

login
MinDBreaK           3
liangfeidotme       3
hjJunior            3
EvertonTomalok      3
Hamza5              3
spidergears         3
dimpurr             3
megatux             3
nishankbhati        3
abinj               3
peterfei            3
ksopyla             3
Anwesh43            3
sebastianconcept    3
Aries0d0f           3
Bharathbrothers     3
dgfigueroa29        3
ankurp              3
chikaobuah          3
jasonsaayman        3
mrhieu              3
nimomeng            3
ErickPetru          3
ramshid             3
arora-72            3
fancyfrees          3
TejasBhalerao       3
itinance            3
Muhammad7Salah      3
joaodaher           3
tgrrr               3
rahulkumaran        3
Froyo91             3
dtype: int64

In [58]:
bios.loc[label_counts[label_counts == 4].index]

login
VakinduPhilliam    node js backend frontend developer react ui ux...
ssekuwanda         pythonista data scientist ml ai django web dev...
Name: bio, dtype: object

In [60]:
bios.loc['VakinduPhilliam']

'node js backend frontend developer react ui ux flutter restful apis java python django ai javascript postgresql blockchain golang scala akka iot'