In [1]:
import re

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from utils import load_data
import config as cfg

In [2]:
users = load_data(format='pandas')

In [3]:
users = users.drop_duplicates()

In [4]:
users.columns

Index(['login', 'id', 'node_id', 'avatar_url', 'gravatar_id', 'url',
       'html_url', 'followers_url', 'following_url', 'gists_url',
       'starred_url', 'subscriptions_url', 'organizations_url', 'repos_url',
       'events_url', 'received_events_url', 'type', 'site_admin', 'name',
       'company', 'blog', 'location', 'email', 'hireable', 'bio',
       'twitter_username', 'public_repos', 'public_gists', 'followers',
       'following', 'created_at', 'updated_at'],
      dtype='object')

In [5]:
users.location.value_counts().index

Index(['Beijing', 'China', 'Beijing, China', 'Shanghai', 'San Francisco, CA',
       'London', 'India', 'Shanghai, China', 'Berlin, Germany', 'Singapore',
       ...
       'London, UK / Italy', 'Anápolis - Goiás, Brasil', 'Shanghai Huangpu',
       'Seville, Andalusia (Spain)',
       'nohup ./bin/norfair00 brain /data > /dev/null 2>&1 &',
       'Hong Kong; Shenzhen, China', 'Taubaté, SP - Brasil', 'Rangkasbitung',
       'Mannheim, Baden-Württemberg, Germany', 'Connecticut'],
      dtype='object', length=9758)

In [6]:
cleaner = lambda text: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", text)

In [7]:
bios = users.set_index('login')['bio']
bios = bios.str.lower()
bios = bios.dropna()
print(bios.iloc[572])
bios = bios.apply(cleaner)
print(bios.iloc[572])
bios = bios.str.split().apply(' '.join)
print(bios.iloc[572])

professor at ifnmg, ph.d. in computational intelligence,  data science and machine intelligence enthusiast at minds/ufmg, researcher at @cidic-ifnmg 
professor at ifnmg  ph d  in computational intelligence   data science and machine intelligence enthusiast at minds ufmg  researcher at  cidic ifnmg 
professor at ifnmg ph d in computational intelligence data science and machine intelligence enthusiast at minds ufmg researcher at cidic ifnmg


In [8]:
bios

login
juankuquintana                       backend engineer at the kernel
josete89                     swift machine learning java javascript
stevejhkang                              interested in cloud spring
andru255                                                  developer
nylqd                                   done is better than perfect
                                        ...                        
kmanley                                                     loading
piraka9011                              roboticist i talk to robots
jurabek           software engineer mostly developing using open...
jikkujose         simulation hypothesis mindfulness decentraliza...
mouadziani        full stack software engineer working with lara...
Name: bio, Length: 33262, dtype: object

In [15]:
def check_occurences(texts, text):
    return texts.str.contains(text).sum()

In [16]:
c = CountVectorizer(max_features=500, ngram_range=(2,2), stop_words='english')

In [17]:
c.fit(bios)

In [18]:
c.get_feature_names_out()

array(['10 years', '15 years', '20 years', 'add bio', 'ai engineer',
       'ai enthusiast', 'ai machine', 'ai ml', 'ai research',
       'ai researcher', 'algorithm engineer', 'android dev',
       'android developer', 'android engineer', 'android ios',
       'android web', 'angular react', 'apaixonado por', 'app developer',
       'application developer', 'application development',
       'applied scientist', 'artificial intelligence', 'asp net',
       'assistant professor', 'associate professor', 'aws certified',
       'aws gcp', 'backend developer', 'backend engineer', 'better place',
       'big data', 'big fan', 'blockchain developer',
       'blockchain enthusiast', 'build stuff', 'build things',
       'building software', 'building things', 'building tools',
       'carnegie mellon', 'ceo founder', 'change world', 'cheap code',
       'chief technology', 'ci cd', 'clean code', 'cloud architect',
       'cloud computing', 'cloud engineer', 'cloud native',
       'complex pro

In [19]:
sorted(c.vocabulary_.items(), key=lambda x: -x[1])

[('years experience', 499),
 ('writing code', 498),
 ('write code', 497),
 ('world better', 496),
 ('work hard', 495),
 ('web technologies', 494),
 ('web software', 493),
 ('web services', 492),
 ('web performance', 491),
 ('web mobile', 490),
 ('web engineer', 489),
 ('web development', 488),
 ('web developer', 487),
 ('web dev', 486),
 ('web designer', 485),
 ('web apps', 484),
 ('web applications', 483),
 ('web application', 482),
 ('web app', 481),
 ('vue react', 480),
 ('vue js', 479),
 ('vp engineering', 478),
 ('vision machine', 477),
 ('vision engineer', 476),
 ('vision deep', 475),
 ('video games', 474),
 ('ux ui', 473),
 ('ux engineer', 472),
 ('ux designer', 471),
 ('user experience', 470),
 ('university technology', 469),
 ('ui ux', 468),
 ('ui engineer', 467),
 ('ui developer', 466),
 ('ui designer', 465),
 ('uc berkeley', 464),
 ('typescript react', 463),
 ('typescript node', 462),
 ('typescript javascript', 461),
 ('tsinghua university', 460),
 ('things web', 459),
 ('te

In [54]:
x = (
    check_occurences(bios, 'web developer'),
    check_occurences(bios, 'backend'),
    check_occurences(bios, 'front end'),
    check_occurences(bios, 'frontend'),
    check_occurences(bios, 'back end'),
    check_occurences(bios, 'php'),
    check_occurences(bios, 'nodejs'),
    check_occurences(bios, 'vuejs'),
    check_occurences(bios, 'django'),
    check_occurences(bios, 'web dev'),
    check_occurences(bios, 'laravel'),
    check_occurences(bios, 'wordpress'),
)
sum(x), x

(5452, (827, 373, 807, 571, 98, 703, 370, 114, 102, 1033, 302, 152))

In [21]:
x = (
    check_occurences(bios, 'ml'),
    check_occurences(bios, 'ai'),
    check_occurences(bios, 'machine learning'),
    check_occurences(bios, 'artificial intelligence'),
    check_occurences(bios, 'computer vision'),
    check_occurences(bios, 'natural language processing'),
    check_occurences(bios, 'nlp'),
    check_occurences(bios, 'deep learning'),
    check_occurences(bios, 'deeplearning'),
    check_occurences(bios, 'machinelearning'),
    check_occurences(bios, 'data mining'),
)
sum(x), x

(5813, (683, 2812, 964, 111, 353, 73, 300, 445, 22, 8, 42))

In [22]:
x = (
    check_occurences(bios, 'mobile'),
    check_occurences(bios, 'ios'),
    check_occurences(bios, 'android'),
    check_occurences(bios, 'swift'),
    check_occurences(bios, 'flutter'),
    check_occurences(bios, 'react native'),
    
)
sum(x), x

(2032, (387, 564, 504, 165, 158, 254))

In [23]:
x = (
    check_occurences(bios, 'data scientist'),
    
)
sum(x), x

(478, (478,))

In [24]:
x = (
    check_occurences(bios, 'data engineer'),
    check_occurences(bios, 'dataengineer'),
)
sum(x), x

(156, (154, 2))

In [25]:
(
    check_occurences(bios, 'software engineer'),
)

(3048,)

In [26]:
(
    check_occurences(bios, 'full stack'),
    check_occurences(bios, 'fullstack'),
)

(1329, 332)

In [35]:
x = (
    check_occurences(bios, 'devops'),
    check_occurences(bios,  'dev ops'),
    check_occurences(bios,  'kubernetes'),
    check_occurences(bios,  'docker'),
    check_occurences(bios,  'ci cd'),
    check_occurences(bios,  'terraform'),
    check_occurences(bios, 'cloud computing'),
)

sum(x), x


(891, (480, 11, 155, 160, 18, 29, 38))

In [29]:
(
    check_occurences(bios, 'game developer'),
    check_occurences(bios, 'unity'),
    check_occurences(bios, 'gamedev'),
)


(47, 219, 19)

In [67]:
bios[bios.str.contains(r'\bwordpress\b')].values

array(['mit grad 20 years senior developer focusing on e commerce react headless cms and web3 blockchain tech well versed in wordpress tech',
       'full stack web developer who in love with wordpress node js and react js',
       'designer developer and wordpress consultant',
       'wordpress developer since 2012',
       'trabalho com web h mais de 13 anos com experi ncia em back end e front end tenho grande experi ncia com desenvolvimento de sites ecommerce s wordpress',
       'javascript php and ruby developer working primarily with jquery wordpress bootstrap foundation angularjs and ruby on rails',
       'greg sweet here running a one man shop for website design development and maintenance custom classicpress wordpress themes and plugins',
       'remote sw engineer automattic react native wordpress deep learning django msc artificial intelligence organizer reactcanarias',
       'full stack web developer wordpress core contributor tv shows fanatic computer scientist',
       

In [38]:
classes = [
    'Web Developer', 
    'Mobile Developer', 
    'Machine Learning Engineer', 
    'Data Engineer', 
    'Data Scientist',
    'UI/UX Developer',
    'DevOps Engineer',
    'Academic',
    'Blockchain Developer'
]

In [39]:
', '.join(classes)

'Web Developer, Mobile Developer, Machine Learning Engineer, Data Engineer, Data Scientist, UI/UX Developer, DevOps Engineer, Academic, Blockchain Developer'

In [55]:
classes_expr = {
    classes[0]: ['web developer', 'backend', 'front end', 'frontend', 'back end', 'php', 'nodejs', 'vuejs', 'django', 'web dev', 'laravel', 'wordpress'],
    classes[1]: ['ios', 'android', 'swift', 'flutter', 'react native', 'dart', 'xamarin', 'ionic', 'nativescript'],
    classes[2]: ['ai', 'machine learning', 'artificial intelligence', 'computer vision', 'natural language processing', 'nlp', 'deep learning', 'deeplearning', 'machinelearning', 'data mining'],
    classes[3]: ['data engineer', 'dataengineer'],
    classes[4]: ['data scientist'],
    classes[5]: ['ui ux', 'ux designer'],
    classes[6]: ['devops', 'dev ops', 'docker', 'kubernetes', 'ci cd', 'terraform', 'cloud computing'],
    classes[7]: ['instructor', 'professor', 'research associate'],
    classes[8]: ['blockchain', 'ethereum', 'web3', 'bitcoin']
}

In [56]:
labels = pd.DataFrame(index=bios.index)
for class_, exprs in classes_expr.items():
    x = pd.Series(0, index=bios.index)
    for e in exprs:
        x = x + bios.str.contains(e).astype(int)
    
    x = (x > 0 ).astype(int)
    labels[class_] = x

In [57]:
labels.sum(axis=0)

Web Developer                3857
Mobile Developer             1438
Machine Learning Engineer    4368
Data Engineer                 156
Data Scientist                478
UI/UX Developer               112
DevOps Engineer               785
Academic                      267
Blockchain Developer          321
dtype: int64

In [58]:
labels

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,DevOps Engineer,Academic,Blockchain Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
juankuquintana,1,0,0,0,0,0,0,0,0
josete89,0,1,1,0,0,0,0,0,0
stevejhkang,0,0,0,0,0,0,0,0,0
andru255,0,0,0,0,0,0,0,0,0
nylqd,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
kmanley,0,0,0,0,0,0,0,0,0
piraka9011,0,0,0,0,0,0,0,0,0
jurabek,0,0,0,0,0,0,1,0,0
jikkujose,0,0,1,0,0,0,0,0,0


In [59]:
bios

login
juankuquintana                       backend engineer at the kernel
josete89                     swift machine learning java javascript
stevejhkang                              interested in cloud spring
andru255                                                  developer
nylqd                                   done is better than perfect
                                        ...                        
kmanley                                                     loading
piraka9011                              roboticist i talk to robots
jurabek           software engineer mostly developing using open...
jikkujose         simulation hypothesis mindfulness decentraliza...
mouadziani        full stack software engineer working with lara...
Name: bio, Length: 33262, dtype: object

In [60]:
labels.merge(bios, left_index=True, right_index=True)

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,DevOps Engineer,Academic,Blockchain Developer,bio
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
007arunwilson,1,0,0,0,0,0,0,0,0,exploring js nodejs dev former php dev react n...
007jedgar,0,0,0,0,0,0,0,0,0,mobile web app dev
00Kai0,0,0,0,0,0,0,0,0,0,coding for fun currently working on game party...
00imvj00,0,0,0,0,0,0,0,0,0,engineer
0101011,0,0,0,0,0,0,0,0,0,software engineer
...,...,...,...,...,...,...,...,...,...,...
yysu,1,0,1,0,0,0,0,0,0,aws certified all 5 familiar with python conta...
yysu,1,0,1,0,0,0,0,0,0,aws certified all 5 familiar with python conta...
zhouzi,0,0,0,0,0,0,0,0,0,software engineer with a passion for user expe...
zhukovgreen,0,0,0,1,0,0,0,0,0,software and data engineer


In [61]:
labeled_users = labels[labels.sum(axis=1) > 0]

In [62]:
labeled_users

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,DevOps Engineer,Academic,Blockchain Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
juankuquintana,1,0,0,0,0,0,0,0,0
josete89,0,1,1,0,0,0,0,0,0
addappcn,0,1,0,0,0,0,0,0,0
mavisland,1,0,0,0,0,0,0,0,0
babakasse,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
danielpetrica,1,0,1,0,0,0,0,0,0
gulullu,1,0,0,0,0,0,0,0,0
jurabek,0,0,0,0,0,0,1,0,0
jikkujose,0,0,1,0,0,0,0,0,0


In [63]:
labeled_users.sum(axis=0)

Web Developer                3857
Mobile Developer             1438
Machine Learning Engineer    4368
Data Engineer                 156
Data Scientist                478
UI/UX Developer               112
DevOps Engineer               785
Academic                      267
Blockchain Developer          321
dtype: int64

In [64]:
labeled_users

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,DevOps Engineer,Academic,Blockchain Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
juankuquintana,1,0,0,0,0,0,0,0,0
josete89,0,1,1,0,0,0,0,0,0
addappcn,0,1,0,0,0,0,0,0,0
mavisland,1,0,0,0,0,0,0,0,0
babakasse,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
danielpetrica,1,0,1,0,0,0,0,0,0
gulullu,1,0,0,0,0,0,0,0,0
jurabek,0,0,0,0,0,0,1,0,0
jikkujose,0,0,1,0,0,0,0,0,0


In [65]:
labeled_users = labeled_users[~labeled_users.index.duplicated(keep='first')]

In [66]:
labeled_users

Unnamed: 0_level_0,Web Developer,Mobile Developer,Machine Learning Engineer,Data Engineer,Data Scientist,UI/UX Developer,DevOps Engineer,Academic,Blockchain Developer
login,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
juankuquintana,1,0,0,0,0,0,0,0,0
josete89,0,1,1,0,0,0,0,0,0
addappcn,0,1,0,0,0,0,0,0,0
mavisland,1,0,0,0,0,0,0,0,0
babakasse,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
jiangzhongkai,1,0,1,0,0,0,0,0,0
fredwu,0,0,1,0,0,0,0,0,0
danielpetrica,1,0,1,0,0,0,0,0,0
gulullu,1,0,0,0,0,0,0,0,0


In [54]:
labeled_users.to_csv(cfg.USER_LABELS_FILE)

In [124]:
label_counts = labeled_users.sum(axis=1)
label_counts[label_counts == 1]

login
juankuquintana    1
addappcn          1
mavisland         1
babakasse         1
jlcallalle        1
                 ..
tak1n             1
fredwu            1
gulullu           1
jurabek           1
jikkujose         1
Length: 8546, dtype: int64

In [56]:
label_counts[label_counts == 2]

login
josete89         2
justmeshishir    2
iwitaly          2
mraible          2
Concert0         2
                ..
flyingant        2
mhink            2
mikevocalz       2
jiangzhongkai    2
danielpetrica    2
Length: 794, dtype: int64

In [57]:
label_counts[label_counts == 3]

login
MinDBreaK           3
liangfeidotme       3
hjJunior            3
EvertonTomalok      3
Hamza5              3
spidergears         3
dimpurr             3
megatux             3
nishankbhati        3
abinj               3
peterfei            3
ksopyla             3
Anwesh43            3
sebastianconcept    3
Aries0d0f           3
Bharathbrothers     3
dgfigueroa29        3
ankurp              3
chikaobuah          3
jasonsaayman        3
mrhieu              3
nimomeng            3
ErickPetru          3
ramshid             3
arora-72            3
fancyfrees          3
TejasBhalerao       3
itinance            3
Muhammad7Salah      3
joaodaher           3
tgrrr               3
rahulkumaran        3
Froyo91             3
dtype: int64

In [58]:
bios.loc[label_counts[label_counts == 4].index]

login
VakinduPhilliam    node js backend frontend developer react ui ux...
ssekuwanda         pythonista data scientist ml ai django web dev...
Name: bio, dtype: object

In [27]:
## original labeled: 8871