In [13]:
import pandas as pd
import statsmodels.api as sm

users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

In [2]:
# Sort by 'followers' in descending order
top_users = users_df.sort_values(by='followers', ascending=False).head(5)

# Get the 'login' of the top 5 users as a comma-separated string
top_5_logins = ", ".join(top_users['login'].tolist())
print("Top 5 users by followers:", top_5_logins)

Top 5 users by followers: dennybritz, wasabeef, dai-shi, rui314, domenic


In [3]:
# 2. 5 earliest registered GitHub users in Tokyo
earliest_users_five_tokyo = users_df.sort_values(by='created_at', ascending=True).head(5)['login'].tolist()
print("Ans 2: 5 earliest registered GitHub users:", ", ".join(earliest_users_five_tokyo))

Ans 2: 5 earliest registered GitHub users: kana, kakutani, mootoh, lhl, walf443


In [4]:
# 3. 3 most popular licenses among these users
repos_with_license = repos_df[repos_df['license_name'].notnull()]
top_3_licenses = repos_with_license['license_name'].value_counts().head(3).index.tolist()
print("Ans 3 : Top 3 most popular licenses:", ", ".join(top_3_licenses))

Ans 3 : Top 3 most popular licenses: mit, apache-2.0, other


In [5]:
# 4. Which company do the majority of these developers work at?
users_with_company = users_df[users_df['company'].notnull()].copy()  
users_with_company.loc[:, 'company'] = users_with_company['company'].str.strip().str.lstrip('@').str.upper()
most_common_company = users_with_company['company'].value_counts().idxmax()
print("Majority of these developers work at:", most_common_company)

Majority of these developers work at: GOOGLE


In [6]:
# 5. Which programming language is most popular among these users?
repos_with_language = repos_df[repos_df['language'].notnull()]
most_common_language = repos_with_language['language'].value_counts().idxmax()
print("Ans 5 : Most popular programming language:", most_common_language)

Ans 5 : Most popular programming language: JavaScript


In [7]:
# 6. Which programming language is the second most popular among users who joined after 2020?
users_df['created_at'] = pd.to_datetime(users_df['created_at'], utc=True)  
comparison_date = pd.to_datetime('2020-01-01').tz_localize('UTC')
users_after_2020 = users_df[users_df['created_at'] > comparison_date]
second_most_common_language = repos_df[repos_df['language'].notnull() & repos_df['login'].isin(users_after_2020['login'])]['language'].value_counts().nlargest(2).idxmin()
print("Ans 6 : Second most popular language among users who joined after 2020:", second_most_common_language)

Ans 6 : Second most popular language among users who joined after 2020: Rust


In [8]:
# 7. Which language has the highest average number of stars per repository?
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Ans 7 : Language with the highest average number of stars per repository:", avg_stars_per_language)

Ans 7 : Language with the highest average number of stars per repository: Assembly


In [9]:
# 8. Define leader_strength as followers / (1 + following). Top 5 in terms of leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.sort_values(by='leader_strength', ascending=False).head(5)['login'].tolist()
print("Ans 8 : Top 5 users in terms of leader_strength:", ", ".join(top_5_leader_strength))

Ans 8 : Top 5 users in terms of leader_strength: blueimp, dai-shi, asahilina, pilcrowonpaper, marcan


In [12]:
# 9. Correlation between the number of followers and the number of public repositories
correlation = users_df['followers'].corr(users_df['public_repos'])
print("Ans 9 : Correlation between followers and repos:", round(correlation, 3))

Ans 9 : Correlation between followers and repos: 0.05


In [14]:
# 10. Regression to estimate additional followers per additional public repository
X = users_df['public_repos']
y = users_df['followers']
X = sm.add_constant(X) 
model = sm.OLS(y, X).fit()
slope = model.params.iloc[1] 
print("Ans 10 : Regression slope of followers on repos:", round(slope, 3))

Ans 10 : Regression slope of followers on repos: 0.272


In [15]:
# 11. Correlation between projects_enabled and wiki_enabled
correlation_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Ans 11 : Correlation between projects_enabled and wiki_enabled:", round(correlation_projects_wiki, 3))

Ans 11 : Correlation between projects_enabled and wiki_enabled: 0.38


In [16]:
# 12. Average following difference for hireable users
average_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
average_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
following_difference = average_following_hireable - average_following_non_hireable
print(f"Ans 12: Average following difference (hireable - non-hireable): {following_difference:.3f}")

Ans 12: Average following difference (hireable - non-hireable): nan


In [17]:
# 13. Regression slope of followers on bio length
users_df['bio_length'] = users_df['bio'].str.len() 
bio_length_correlation = users_df[users_df['bio'].notna()]['bio_length'].corr(users_df['followers'])
print(f"Ans 13 : Correlation of bio length with followers: {bio_length_correlation:.3f}")

Ans 13 : Correlation of bio length with followers: 0.087


In [18]:
filtered_users_df = users_df.dropna(subset=['followers', 'bio_length'])

y = filtered_users_df['followers']
X = sm.add_constant(filtered_users_df['bio_length'])

bio_model = sm.OLS(y, X).fit()
print(bio_model.summary())

                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     2.910
Date:                Thu, 31 Oct 2024   Prob (F-statistic):             0.0889
Time:                        12:23:45   Log-Likelihood:                -3170.0
No. Observations:                 380   AIC:                             6344.
Df Residuals:                     378   BIC:                             6352.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        538.6199     90.320      5.963      0.0

In [19]:
# 14. Users who created the most repositories on weekends (UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at']) 
repos_df['created_on_weekend'] = repos_df['created_at'].dt.dayofweek >= 5  
weekend_repos = repos_df[repos_df['created_on_weekend']]
top_users_weekend = weekend_repos['login'].value_counts().head(5).index.tolist()
print(f"Ans 14 : Top 5 users who created most repositories on weekends: {', '.join(top_users_weekend)}")

Ans 14 : Top 5 users who created most repositories on weekends: qnighy, h6ah4i, takahashim, suzuki-shunsuke, kevincobain2000


In [20]:
# 15. Difference in email fractions for hireable and non-hireable users
fraction_email_hireable = users_df[users_df['hireable'] == True]['email'].notna().mean()
fraction_email_non_hireable = users_df[users_df['hireable'] == False]['email'].notna().mean()
email_fraction_difference = fraction_email_hireable - fraction_email_non_hireable
print(f"Ans 15 : Difference in email fractions (hireable - non-hireable): {email_fraction_difference:.3f}")

Ans 15 : Difference in email fractions (hireable - non-hireable): nan


In [21]:

# 16. Most common surname
users_df['surname'] = users_df['name'].str.split().str[-1]
common_surnames = users_df['surname'].value_counts()
most_common_surname = common_surnames.idxmax()
most_common_surname_count = common_surnames.max()
print(f"Ans 16 : Most common surname: {most_common_surname} with {most_common_surname_count} users")

Ans 16 : Most common surname: Tanaka with 5 users
