In [1]:
# Using the GitHub API, to scrape users and their public repositories.

In [87]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm

In [88]:
load_dotenv()

True

In [89]:
location = "Beijing"
min_followers = 500

# Fetch Users with Follower count condition

In [90]:
# This function reads Link header from response to navigate next page url

def get_next_page_url(response):
    link_header = response.headers.get('Link', '')
    # print(f"{link_header=}")
    next_page_url = None
    for x in link_header.split(","):
        match = re.search(r'<(.*)>; rel="next"', x)
        # print(match)
        if match:
            next_page_url = match.group(1)

    return next_page_url

In [91]:
# Fetch all users from location with min_followers count
url = f"https://api.github.com/search/users?q=location:{location}+followers:>{min_followers}&per_page=100"
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {os.getenv("GITHUB_TOKEN")}"}

users = []
while True:
    print(url)
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error fetching users: {response.text}")
        break
    
    data = response.json()
    users.extend(data["items"])
    next_page_url = get_next_page_url(response)
    print("Next:", next_page_url)
    if next_page_url is None:
        break

    url = next_page_url

len(users)


https://api.github.com/search/users?q=location:Beijing+followers:>500&per_page=100
Next: https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=2
https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=2
Next: https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=3
https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=3
Next: https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=4
https://api.github.com/search/users?q=location%3ABeijing+followers%3A%3E500&per_page=100&page=4
Next: None


360

In [94]:
# Save users data to json file
with open("users.json", "w") as f:
    json.dump(users, f)

In [95]:
df_users = pd.DataFrame(users)

In [96]:
df_users.head()

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,starred_url,subscriptions_url,organizations_url,repos_url,events_url,received_events_url,type,user_view_type,site_admin,score
0,michaelliao,470058,MDQ6VXNlcjQ3MDA1OA==,https://avatars.githubusercontent.com/u/470058...,,https://api.github.com/users/michaelliao,https://github.com/michaelliao,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/gists...,https://api.github.com/users/michaelliao/starr...,https://api.github.com/users/michaelliao/subsc...,https://api.github.com/users/michaelliao/orgs,https://api.github.com/users/michaelliao/repos,https://api.github.com/users/michaelliao/event...,https://api.github.com/users/michaelliao/recei...,User,public,False,1.0
1,daimajia,2503423,MDQ6VXNlcjI1MDM0MjM=,https://avatars.githubusercontent.com/u/250342...,,https://api.github.com/users/daimajia,https://github.com/daimajia,https://api.github.com/users/daimajia/followers,https://api.github.com/users/daimajia/followin...,https://api.github.com/users/daimajia/gists{/g...,https://api.github.com/users/daimajia/starred{...,https://api.github.com/users/daimajia/subscrip...,https://api.github.com/users/daimajia/orgs,https://api.github.com/users/daimajia/repos,https://api.github.com/users/daimajia/events{/...,https://api.github.com/users/daimajia/received...,User,public,False,1.0
2,xiaolai,152970,MDQ6VXNlcjE1Mjk3MA==,https://avatars.githubusercontent.com/u/152970...,,https://api.github.com/users/xiaolai,https://github.com/xiaolai,https://api.github.com/users/xiaolai/followers,https://api.github.com/users/xiaolai/following...,https://api.github.com/users/xiaolai/gists{/gi...,https://api.github.com/users/xiaolai/starred{/...,https://api.github.com/users/xiaolai/subscript...,https://api.github.com/users/xiaolai/orgs,https://api.github.com/users/xiaolai/repos,https://api.github.com/users/xiaolai/events{/p...,https://api.github.com/users/xiaolai/received_...,User,public,False,1.0
3,draveness,6493255,MDQ6VXNlcjY0OTMyNTU=,https://avatars.githubusercontent.com/u/649325...,,https://api.github.com/users/draveness,https://github.com/draveness,https://api.github.com/users/draveness/followers,https://api.github.com/users/draveness/followi...,https://api.github.com/users/draveness/gists{/...,https://api.github.com/users/draveness/starred...,https://api.github.com/users/draveness/subscri...,https://api.github.com/users/draveness/orgs,https://api.github.com/users/draveness/repos,https://api.github.com/users/draveness/events{...,https://api.github.com/users/draveness/receive...,User,public,False,1.0
4,hongyangAndroid,10704521,MDQ6VXNlcjEwNzA0NTIx,https://avatars.githubusercontent.com/u/107045...,,https://api.github.com/users/hongyangAndroid,https://github.com/hongyangAndroid,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/g...,https://api.github.com/users/hongyangAndroid/s...,https://api.github.com/users/hongyangAndroid/s...,https://api.github.com/users/hongyangAndroid/orgs,https://api.github.com/users/hongyangAndroid/r...,https://api.github.com/users/hongyangAndroid/e...,https://api.github.com/users/hongyangAndroid/r...,User,public,False,1.0


In [97]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   login                360 non-null    object 
 1   id                   360 non-null    int64  
 2   node_id              360 non-null    object 
 3   avatar_url           360 non-null    object 
 4   gravatar_id          360 non-null    object 
 5   url                  360 non-null    object 
 6   html_url             360 non-null    object 
 7   followers_url        360 non-null    object 
 8   following_url        360 non-null    object 
 9   gists_url            360 non-null    object 
 10  starred_url          360 non-null    object 
 11  subscriptions_url    360 non-null    object 
 12  organizations_url    360 non-null    object 
 13  repos_url            360 non-null    object 
 14  events_url           360 non-null    object 
 15  received_events_url  360 non-null    obj

In [98]:
df_users.isna().sum()

login                  0
id                     0
node_id                0
avatar_url             0
gravatar_id            0
url                    0
html_url               0
followers_url          0
following_url          0
gists_url              0
starred_url            0
subscriptions_url      0
organizations_url      0
repos_url              0
events_url             0
received_events_url    0
type                   0
user_view_type         0
site_admin             0
score                  0
dtype: int64

## Fetch Repositories

In [99]:
# Fetch all public repositories of all above users
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {os.getenv("GITHUB_TOKEN")}"}

repositories = []

for i, row in df_users.iterrows():
    username = row['login']

    url = f"https://api.github.com/users/{username}/repos"
    while True:
        print(url)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching users: {response.text}")
            break
        
        data = response.json()
        repositories.extend(data)
        next_page_url = get_next_page_url(response)
        print("Next:", next_page_url)
        if next_page_url is None:
            break
        url = next_page_url
    
len(repositories)


https://api.github.com/users/michaelliao/repos
Next: https://api.github.com/user/470058/repos?page=2
https://api.github.com/user/470058/repos?page=2
Next: https://api.github.com/user/470058/repos?page=3
https://api.github.com/user/470058/repos?page=3
Next: https://api.github.com/user/470058/repos?page=4
https://api.github.com/user/470058/repos?page=4
Next: None
https://api.github.com/users/daimajia/repos
Next: https://api.github.com/user/2503423/repos?page=2
https://api.github.com/user/2503423/repos?page=2
Next: https://api.github.com/user/2503423/repos?page=3
https://api.github.com/user/2503423/repos?page=3
Next: None
https://api.github.com/users/xiaolai/repos
Next: https://api.github.com/user/152970/repos?page=2
https://api.github.com/user/152970/repos?page=2
Next: None
https://api.github.com/users/draveness/repos
Next: https://api.github.com/user/6493255/repos?page=2
https://api.github.com/user/6493255/repos?page=2
Next: None
https://api.github.com/users/hongyangAndroid/repos
Next: 

32358

In [100]:
len(repositories)

32358

In [101]:
# save repositories data to json file
with open("repositories.json", "w") as f:
    json.dump(repositories, f)

In [102]:
len(users),len(repositories)

(360, 32358)

# Fetch user profiles

In [103]:
# Fetch user profiles 
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {os.getenv("GITHUB_TOKEN")}"}
user_profiles = []
for i, row in tqdm(df_users.iterrows()):
    username = row['login']
    url = f"https://api.github.com/users/{username}"
    # print(url)
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error fetching users: {response.text}")
        break
    
    user_profiles.append(response.json())
    
len(user_profiles)


360it [02:31,  2.38it/s]


360

In [104]:
# Save user profiles to JSON file
with open("user_profiles.json", "w") as f:
    json.dump(user_profiles, f)

# Load data back again from JSON files

In [105]:
with open("users.json") as f:
    users = json.load(f)

In [106]:
with open("repositories.json") as f:
    repositories = json.load(f)

In [107]:
with open("user_profiles.json") as f:
    user_profiles = json.load(f)

In [108]:
df_users = pd.DataFrame(users)
df_user_profiles = pd.DataFrame(user_profiles)
df_repositories = pd.DataFrame(repositories)

## Users data cleaning

 * login: Their Github user ID
 * name: Their full name
 * company: The company they work at. Clean up company names. At least make sure:
   * They're trimmed of whitespace
   * Leading @ symbols are stripped
   * They are converted to UPPERCASE

 * location: The city they are in
 * email: Their email address
 * hireable: Whether they are open to being hired
 * bio: A short bio about them
 * public_repos: The number of public repositories they have
 * followers: The number of followers they have
 * following: The number of people they are following
 * created_at: When they joined Github


```
user.login
profile.name
profile.company -> re.sub(r'(^@)|(\s@)', ' ', companies).strip().upper()
profile.location
profile.email
profile.hireable -> map {True: 'true', False: 'false', None: ''}
profile.bio
profile.public_repos
profile.followers
profile.following
profile.created_at
```

In [109]:
df_profiles = pd.DataFrame(user_profiles)

In [110]:
df_user_profiles = df_profiles[["login", "name", "company", "location", "email", "hireable", "bio", "public_repos", "followers", "following", "created_at"]].fillna('')
df_user_profiles["company"] = df_user_profiles["company"].map(lambda x: re.sub(r'(^@)|(\s@)', ' ', x).strip().upper())
# df_user_profiles.loc[df_user_profiles["hireable"] == True, "hireable"] = 'true'
# df_user_profiles.loc[df_user_profiles["hireable"] == False, "hireable"] = 'false'
# df_user_profiles.info()

df_user_profiles.head()


Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37254,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZHENFUND BEIJING,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24631,271,2012-10-07T02:40:06Z
2,xiaolai,xiaolai,INBLOCKCHAIN,beijing,lixiaolai@gmail.com,,A lifelong student.,55,19268,37,2009-11-13T18:29:42Z
3,draveness,Draven,SPECTRA-FUND,"Beijing, China",i@draven.co,True,HFT / C++ / Go,50,13010,28,2014-01-24T16:22:01Z
4,hongyangAndroid,张鸿洋,WANANDROID.COM,"Beijing,China",623565791@qq.com,True,学习ing,102,12986,35,2015-01-26T07:05:45Z


In [111]:
df_user_profiles.to_csv("users.csv", index=False)

## Repositories data cleaning

For each user in users.csv, fetch up to the 500 most recently pushed repositories, with fields:

```
login: The Github user ID (login) of the owner, which, BTW, is not directly in the API response.)

full_name: Full name of the repository

created_at: When the repository was created

stargazers_count: Number of stars the repository has

watchers_count: Number of watchers the repository has

language: The programming language the repository is written in

has_projects: Whether the repository has projects enabled
  -> map {True: 'true', False: 'false', None: ''}

has_wiki: Whether the repository has a wiki
  -> map {True: 'true', False: 'false', None: ''}

license_name: Name of the license the repository is under (This is under license.key)
```

In [112]:
df_repositories = pd.DataFrame(repositories)

In [113]:
df_all_repos = df_repositories.copy()
df_all_repos["license_name"] = df_all_repos["license"].map(lambda x: x["key"] if x else None)
df_all_repos["login"] = df_all_repos["owner"].map(lambda x: x["login"])
df_all_repos = df_all_repos[["login", "full_name", "created_at", "pushed_at",
                             "stargazers_count", "watchers_count", "language", "has_projects", "has_wiki", "license_name"]]

#df_all_repos.loc[df_all_repos["has_projects"] == True, "has_projects"] = 'true'
#df_all_repos.loc[df_all_repos["has_projects"] == False, "has_projects"] = 'false'
#df_all_repos.loc[df_all_repos["has_wiki"] == True, "has_wiki"] = 'true'
#df_all_repos.loc[df_all_repos["has_wiki"] == False, "has_wiki"] = 'false'

# df_all_repos = df_all_repos.fillna('')
df_all_repos.info()
# df_all_repos.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32358 entries, 0 to 32357
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   login             32358 non-null  object
 1   full_name         32358 non-null  object
 2   created_at        32358 non-null  object
 3   pushed_at         32349 non-null  object
 4   stargazers_count  32358 non-null  int64 
 5   watchers_count    32358 non-null  int64 
 6   language          22367 non-null  object
 7   has_projects      32358 non-null  bool  
 8   has_wiki          32358 non-null  bool  
 9   license_name      17616 non-null  object
dtypes: bool(2), int64(2), object(6)
memory usage: 2.0+ MB


In [114]:
for g, df in df_all_repos.groupby("login"):
    prevlen, nextlen = len(df), len(df.sort_values("pushed_at", ascending=False).head(500))
    if(prevlen != nextlen):
        print(g, prevlen, nextlen)

LinuxSuRen 1842 500
hailiang-wang 1058 500
i5ting 902 500
wanglong 610 500
xiaoweiruby 860 500


In [115]:
df_all_repos = (
    df_all_repos.groupby("login")
    .apply(lambda df: df.sort_values("pushed_at").head(500),
           include_groups=True
          )
    .reset_index(drop=True)
    .drop(columns=["pushed_at"])
)

df_all_repos.sample(10)

  .apply(lambda df: df.sort_values("pushed_at").head(500),


Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
5225,Vonng,Vonng/blog,2017-12-13T05:20:10Z,7,7,Jupyter Notebook,True,True,
6270,ZhaoKaiQiang,ZhaoKaiQiang/PagerSlidingTabStrip,2014-10-21T09:46:00Z,0,0,,True,True,
2305,JasinYip,JasinYip/babel,2022-02-16T11:14:11Z,0,0,TypeScript,True,False,mit
26977,xiaoweiruby,xiaoweiruby/vue-music-player,2018-04-04T01:55:35Z,0,0,Vue,True,True,
29445,zswang,zswang/h5i18n-php,2017-07-04T13:44:34Z,0,0,PHP,True,True,
16215,liubin,liubin/ConsoleMonitor,2013-11-09T03:37:25Z,0,0,Java,True,True,
6409,Zheaoli,Zheaoli/recursion,2017-12-06T10:24:29Z,0,0,Jupyter Notebook,True,True,
27182,xiaye13579,xiaye13579/xUtils,2015-04-28T02:09:18Z,0,0,Java,True,True,
8975,cundong,cundong/blog-1,2015-05-07T09:07:39Z,0,0,,True,True,apache-2.0
21245,rfyiamcool,rfyiamcool/python-pid,2015-09-16T16:07:02Z,5,5,Python,True,True,mit


In [116]:
# Write to csv
df_all_repos.to_csv("repositories.csv", index=False)


In [117]:
df_all_repos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29586 entries, 0 to 29585
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   login             29586 non-null  object
 1   full_name         29586 non-null  object
 2   created_at        29586 non-null  object
 3   stargazers_count  29586 non-null  int64 
 4   watchers_count    29586 non-null  int64 
 5   language          21105 non-null  object
 6   has_projects      29586 non-null  bool  
 7   has_wiki          29586 non-null  bool  
 8   license_name      15854 non-null  object
dtypes: bool(2), int64(2), object(5)
memory usage: 1.6+ MB


## Analysis

In [118]:
# 1. Who are the top 5 users in Beijing with the highest number of followers? List their login in order, comma-separated.

df_filtered = df_user_profiles.sort_values("followers", ascending=False).head(5)
users_list = df_filtered["login"].values
",".join(users_list)
# df_filtered

'michaelliao,daimajia,xiaolai,draveness,hongyangAndroid'

In [119]:
# 2. Who are the 5 earliest registered GitHub users in Beijing? List their login in ascending order of created_at, comma-separated.

df_filtered = df_user_profiles.sort_values("created_at", ascending=True).head(5)
users_list = df_filtered["login"].values
",".join(users_list)
# df_filtered

'robin,nwind,reeze,kejun,ZhangHanDong'

In [120]:
# 3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.


df_filtered = df_all_repos[df_all_repos.license_name != ''].license_name.value_counts().head(3).reset_index()
license_list = df_filtered.license_name.values
",".join(license_list)


'mit,apache-2.0,other'

In [121]:
# 4. Which company do the majority of these developers work at?


df_user_profiles[df_user_profiles.company != ''].value_counts("company", ascending=False).head()

company
BYTEDANCE              12
ALIBABA                 9
TSINGHUA UNIVERSITY     7
PEKING UNIVERSITY       7
TENCENT                 6
Name: count, dtype: int64

In [122]:
# 5. Which programming language is most popular among these users?

df_all_repos.value_counts("language", ascending=False).head()

language
JavaScript    4422
Python        3259
Java          2368
Go            1365
C++           1214
Name: count, dtype: int64

In [123]:
# 6. Which programming language is the second most popular among users who joined after 2020?

selected_users = df_user_profiles[pd.to_datetime(df_user_profiles.created_at).dt.year >= 2020].login

df_all_repos[df_all_repos.login.isin(selected_users)].value_counts("language", ascending=False).head()



language
Python              32
JavaScript          24
HTML                19
TypeScript          16
Jupyter Notebook    12
Name: count, dtype: int64

In [124]:
# 7. Which language has the highest average number of stars per repository?

df_all_repos.groupby("language")["stargazers_count"].mean().reset_index().sort_values("stargazers_count", ascending=False).head(2)

Unnamed: 0,language,stargazers_count
52,Jinja,3431.0
114,Solidity,1290.555556


In [125]:
# 8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength?
# List their login in order, comma-separated.

df_leader = df_user_profiles.copy()
df_leader.head()
# leader_strength as followers / (1 + following).
df_leader["leader_strength"] = df_leader["followers"] / (1 + df_leader["following"])

In [126]:
df_filtered = df_leader.sort_values("leader_strength", ascending=False)[["login", "leader_strength"]].head()
login_list = df_filtered.login.values
",".join(login_list)

'michaelliao,ityouknow,liuhuanyong,thunlp,shenghy'

In [127]:
# 9. What is the correlation between the number of followers and the number of public repositories among users in Beijing?
# Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

df_user_profiles[["public_repos", "followers"]].corr()

Unnamed: 0,public_repos,followers
public_repos,1.0,0.032918
followers,0.032918,1.0


In [128]:
df_user_profiles.head(2)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37254,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZHENFUND BEIJING,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24631,271,2012-10-07T02:40:06Z


In [129]:

# 10. Does creating more repos help users get more followers? Using regression, 
# estimate how many additional followers a user gets per additional public repository.
# regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

from sklearn.linear_model import LinearRegression

In [130]:
X = df_user_profiles[["public_repos"]]
y = df_user_profiles["followers"]
model = LinearRegression()
model.fit(X, y)

In [131]:
np.round(model.coef_, 3), model.intercept_

(array([0.657]), 1591.3537784716464)

In [132]:
# 11. Do people typically enable projects and wikis together? 
# What is the correlation between a repo having projects enabled and having wiki enabled?

df2 = df_all_repos[["has_projects", "has_wiki"]].copy()
#df2["has_projects"] = df2["has_projects"].map(lambda x: True if x == 'true' else False if x == 'false' else np.nan)
#df2["has_wiki"] = df2["has_projects"].map(lambda x: True if x == 'true' else False if x == 'false' else np.nan)
df2.dtypes
print(np.round(df2.corr().iloc[0,1] ,3))
df2.corr()
# df2.corr().iloc[0,1]

0.272


Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.272323
has_wiki,0.272323,1.0


In [133]:
df2.dtypes

has_projects    bool
has_wiki        bool
dtype: object

In [134]:
# 12. Do hireable users follow more people than those who are not hireable?
# Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [135]:
df_user_profiles.head(2)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37254,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZHENFUND BEIJING,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24631,271,2012-10-07T02:40:06Z


In [136]:
hireable_mean_followers = df_user_profiles[df_user_profiles.hireable == True].following.mean()
non_hireable_mean_followers = df_user_profiles[df_user_profiles.hireable != True].following.mean()
hireable_mean_followers, non_hireable_mean_followers, np.round(hireable_mean_followers - non_hireable_mean_followers, 3)

(267.7029702970297, 118.1891891891892, 149.514)

In [137]:
df3.head(3)

Unnamed: 0,bio,followers,bio_word_count
0,Crypto developer.,37255,2
1,Zhenfund VP of Investment.,24631,4
2,A lifelong student.,19265,3


In [138]:
# 13. Some developers write long bios. Does that help them get more followers? 
# What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers?
# (Ignore people without bios)
# Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

# Filter user with empty bio
df3 = df_user_profiles[df_user_profiles.bio.str.strip().str.len() > 0][["bio", "followers"]].copy()
# Count words in bio
df3["bio_word_count"] = df3.bio.str.split().str.len()
X = df3[["bio_word_count"]]
y = df3["followers"]

model = LinearRegression()
model.fit(X, y)
np.round(model.intercept_, 3), np.round(model.coef_, 3)
#, np.round(df3[["bio_word_count", "followers"]].corr(), 3)

(1865.047, array([-12.112]))

In [139]:
# m,b = np.polyfit(df3["followers"], df3["bio_word_count"], 1)
m,b = np.polyfit(df3["bio_word_count"], df3["followers"], 1)
np.round(m, 3), b

(-12.112, 1865.0467414836678)

In [140]:
# Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated
# Users login


df4 = df_all_repos.copy()
df4["created_at"] = pd.to_datetime(df4.created_at)
results = df4[(df4.created_at.dt.dayofweek.isin([5,6]))].groupby("login")["created_at"].count().sort_values(ascending=False).head(5).reset_index()
",".join(results["login"].values)


'LinuxSuRen,zhufengnodejs,xiaoweiruby,i5ting,mozillazg'

In [141]:
# 15. Do people who are hireable share their email addresses more often?
# [fraction of users with email when hireable=true] minus [fraction of users with email for the rest] 
# (to 3 decimal places, e.g. 0.123 or -0.123)

df_hireable = df_user_profiles[df_user_profiles.hireable == True]
df_non_hireable = df_user_profiles[df_user_profiles.hireable != True]

hireable_fraction = df_hireable[df_hireable.email != ''].shape[0] / df_hireable.shape[0]
non_hireable_fraction = df_non_hireable[df_non_hireable.email != ''].shape[0] / df_non_hireable.shape[0]

np.round(hireable_fraction - non_hireable_fraction, 3)

0.063

In [142]:
df_user_profiles.head(2)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37254,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZHENFUND BEIJING,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24631,271,2012-10-07T02:40:06Z


In [143]:
# 16. Let's assume that the last word in a user's name is their surname 
# (ignore missing names, trim and split by whitespace.) 
# What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
# Number of users with the most common surname

df5 = df_user_profiles[df_user_profiles.name != ''].copy()
df5.name.str.split(" ").str[-1].str.strip().value_counts().head(2)

name
Zhang    11
Wang      8
Name: count, dtype: int64