## **Creating user and repo files**

In [None]:
import requests
import pandas as pd

# GitHub API base URL
GITHUB_API_URL = "https://api.github.com"
# Personal access token to authenticate (replace with your token)
TOKEN = "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"

# Headers for GitHub API requests
headers = {
    "Authorization": f"token {TOKEN}"
}

# Function to get users in Beijing with over 500 followers
def get_users_from_beijing():
    users = []
    url = f"{GITHUB_API_URL}/search/users?q=location:Beijing+followers:>500&per_page=100"

    while url:
        response = requests.get(url, headers=headers)
        data = response.json()
        users.extend(data['items'])
        # Check if there's a 'next' page
        url = response.links.get('next', {}).get('url')

    return users

# Function to get user details (especially created_at)
def get_user_details(login):
    user_url = f"{GITHUB_API_URL}/users/{login}"
    response = requests.get(user_url, headers=headers)
    return response.json()

# Function to get repositories for a user
def get_user_repositories(login):
    repos = []
    repo_url = f"{GITHUB_API_URL}/users/{login}/repos?per_page=100&type=public"
    while repo_url:
        response = requests.get(repo_url, headers=headers)
        repos.extend(response.json())
        repo_url = response.links.get('next', {}).get('url')
    return repos[:500]  # Limit to the 500 most recent repositories

# Main function to fetch users and export to CSV
def main():
    # Get users from Beijing with > 500 followers
    users = get_users_from_beijing()
    user_details = []
    repo_details = []

    # Fetch detailed info for each user
    for user in users:
        print(f"Processing user: {user['login']}")
        details = get_user_details(user['login'])

        # Add raw user data to user_details without any cleaning
        user_details.append({
            'login': details['login'],
            'name': details.get('name', 'N/A'),
            'company': details.get('company', 'N/A'),
            'location': details.get('location', 'N/A'),
            'email': details.get('email', 'N/A'),
            'hireable': details.get('hireable', False),
            'bio': details.get('bio', 'N/A'),
            'public_repos': details['public_repos'],
            'followers': details['followers'],
            'following': details['following'],
            'created_at': details['created_at']
        })

        # Get repositories for the user
        repos = get_user_repositories(user['login'])
        for repo in repos:
            repo_details.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo.get('language', 'N/A'),
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['name'] if repo.get('license') else 'N/A'
            })

    # Save the user details to users.csv
    df_users = pd.DataFrame(user_details)
    df_users.to_csv('users.csv', index=False)
    print("users.csv file has been created.")

    # Save the repository details to repositories.csv
    df_repos = pd.DataFrame(repo_details)
    df_repos.to_csv('repositories.csv', index=False)
    print("repositories.csv file has been created.")

if __name__ == "__main__":
    main()


Processing user: michaelliao
Processing user: daimajia
Processing user: xiaolai
Processing user: draveness
Processing user: hongyangAndroid
Processing user: haoel
Processing user: wizardforcel
Processing user: i5ting
Processing user: 521xueweihan
Processing user: ityouknow
Processing user: PKUFlyingPig
Processing user: singwhatiwanna
Processing user: tangqiaoboy
Processing user: gaoxiang12
Processing user: liuhuanyong
Processing user: yanhaijing
Processing user: HcySunYang
Processing user: julycoding
Processing user: jindongwang
Processing user: rfyiamcool
Processing user: Terry-Mao
Processing user: chyyuu
Processing user: cch123
Processing user: rootsongjc
Processing user: ymcui
Processing user: dongweiming
Processing user: zce
Processing user: wu-sheng
Processing user: sunnyxx
Processing user: wangfupeng1988
Processing user: johnlui
Processing user: lilydjwg
Processing user: thunlp
Processing user: zhengmin1989
Processing user: baoyongzhang
Processing user: shenghy
Processing user: w

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!!pip install pandas requests




In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Convert 'created_at' to datetime format for accurate sorting
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by 'created_at' in ascending order and select the top 5 earliest users
earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)

# Extract the 'login' column and format it as a comma-separated string
earliest_users_logins = ','.join(earliest_users['login'].tolist())

print("Earliest registered GitHub users in Beijing:", earliest_users_logins)


Earliest registered GitHub users in Beijing: robin,nwind,reeze,kejun,ZhangHanDong


## **top 3 most popular license**

In [None]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv('/content/repositories.csv')

# Drop rows where 'license_name' is empty or null
repos_df = repos_df.dropna(subset=['license_name'])
repos_df = repos_df[repos_df['license_name'] != ""]

# Count the occurrences of each license name
license_counts = repos_df['license_name'].value_counts()

# Get the top 3 most common licenses
top_licenses = license_counts.head(3).index.tolist()

# Format as a comma-separated string
top_licenses_str = ','.join(top_licenses)

print("Most popular licenses:", top_licenses_str)


Most popular licenses: MIT License,Apache License 2.0,Other


## **to get cleaned users.csv**

In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Clean up the 'company' column
def clean_company_name(company):
    if pd.isna(company):
        return ""  # Return empty string if the company is NaN
    cleaned_company = company.strip()        # Trim whitespace
    cleaned_company = cleaned_company.lstrip('@')  # Remove leading '@'
    cleaned_company = cleaned_company.upper()      # Convert to uppercase
    return cleaned_company

# Apply the cleaning function to the 'company' column
users_df['company'] = users_df['company'].apply(clean_company_name)

# Save the cleaned data back to users.csv (or to a new file if you prefer)
users_df.to_csv('users_cleaned.csv', index=False)

print("Company column cleaned and saved to 'users_cleaned.csv'")


Company column cleaned and saved to 'users_cleaned.csv'


## **company which majority devlopers work at**

In [None]:
import pandas as pd

# Load the cleaned users data
users_df = pd.read_csv('/content/users_cleaned.csv')

# Filter out empty company names
companies = users_df['company'].dropna()
companies = companies[companies != ""]

# Find the most common company
most_common_company = companies.value_counts().idxmax()

print("The majority of these developers work at:", most_common_company)


The majority of these developers work at: BYTEDANCE


### **second most popular language**

In [None]:
import pandas as pd

# Load users and repositories data
users_df = pd.read_csv('/content/users.csv')
repos_df = pd.read_csv('/content/repositories.csv')

# Convert 'created_at' to datetime format in users data
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020
recent_users = users_df[users_df['created_at'].dt.year > 2020]

# Get the logins of these recent users
recent_user_logins = recent_users['login'].unique()

# Filter repositories for these recent users
recent_repos = repos_df[repos_df['login'].isin(recent_user_logins)]

# Drop rows with missing or empty 'language' values
recent_repos = recent_repos.dropna(subset=['language'])
recent_repos = recent_repos[recent_repos['language'] != ""]

# Find the second most popular programming language
language_counts = recent_repos['language'].value_counts()
second_most_popular_language = language_counts.index[1]  # Index 1 for second most popular

print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)


The second most popular programming language among users who joined after 2020 is: HTML


### **averahe no of stars per repository**

In [None]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv('/content/repositories.csv')

# Drop rows where 'language' or 'stargazers_count' is missing or empty
repos_df = repos_df.dropna(subset=['language', 'stargazers_count'])
repos_df = repos_df[repos_df['language'] != ""]

# Calculate the average stars per language
average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average number of stars
top_language = average_stars_per_language.idxmax()
top_avg_stars = average_stars_per_language.max()

print("The language with the highest average number of stars per repository is:", top_language)
print("Average stars:", top_avg_stars)


The language with the highest average number of stars per repository is: Jinja
Average stars: 3418.0


### **Leader strength**

In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5 users
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Extract the 'login' column and format it as a comma-separated string
top_leaders_logins = ','.join(top_leaders['login'].tolist())

print("Top 5 users by leader_strength:", top_leaders_logins)


Top 5 users by leader_strength: michaelliao,ityouknow,liuhuanyong,thunlp,shenghy


# **correlation**

In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Filter users located in Beijing
beijing_users = users_df[users_df['location'].str.contains("Beijing", case=False, na=False)]

# Calculate the correlation between followers and public_repos
correlation = beijing_users['followers'].corr(beijing_users['public_repos'])

print("Correlation between followers and public repositories:", f"{correlation:.3f}")


Correlation between followers and public repositories: 0.033


## **regression**

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Filter users located in Beijing
beijing_users = users_df[users_df['location'].str.contains("Beijing", case=False, na=False)]

# Reshape data for regression
X = beijing_users[['public_repos']]  # Independent variable (public repositories)
y = beijing_users['followers']       # Dependent variable (followers)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the regression slope
slope = model.coef_[0]

print("Regression slope of followers on repos:", f"{slope:.3f}")


Regression slope of followers on repos: 0.654



Question 11



In [None]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv('/content/repositories.csv')

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

print("Correlation between having projects and wikis enabled:", f"{correlation:.3f}")


Correlation between having projects and wikis enabled: 0.277


second popular language

In [None]:
import pandas as pd

# Load users and repositories data
users_df = pd.read_csv('/content/users.csv')
repos_df = pd.read_csv('/content/repositories.csv')

# Convert 'created_at' to datetime format in users data
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020
recent_users = users_df[users_df['created_at'].dt.year > 2020]

# Get the logins of these recent users
recent_user_logins = recent_users['login'].unique()

# Filter repositories for these recent users
recent_repos = repos_df[repos_df['login'].isin(recent_user_logins)]

# Drop rows with missing or empty 'language' values
recent_repos = recent_repos.dropna(subset=['language'])
recent_repos = recent_repos[recent_repos['language'] != ""]

# Count the occurrences of each language
language_counts = recent_repos['language'].value_counts()

# Get the second most popular programming language
second_most_popular_language = language_counts.index[1]  # Index 1 for second most popular

print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)


The second most popular programming language among users who joined after 2020 is: HTML


In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users_modified.csv')

# Calculate the average following for hireable users (hireable = true)
hireable_avg = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average following for non-hireable users (hireable = false)
non_hireable_avg = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference
average_difference = hireable_avg - non_hireable_avg

print("Average following for hireable users minus non-hireable users:", f"{average_difference:.3f}")


Average following for hireable users minus non-hireable users: nan


In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Calculate the length of the bio in words, ignoring whitespace
users_df['bio_length'] = users_df['bio'].apply(lambda x: len(str(x).split()) if pd.notna(x) else 0)

# Filter out users without a bio (length = 0)
bio_users = users_df[users_df['bio_length'] > 0]

# Calculate the correlation between bio length and number of followers
correlation = bio_users['bio_length'].corr(bio_users['followers'])

print("Correlation between bio length and followers:", f"{correlation:.3f}")


Correlation between bio length and followers: -0.019


In [None]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv('/content/repositories_modified.csv')

# Convert 'created_at' to datetime format
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday = 5, Sunday = 6)
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek.isin([5, 6])

# Count the number of repositories created on weekends by each user
weekend_repos = repos_df[repos_df['is_weekend']].groupby('login').size()

# Get the top 5 users with the most repositories created on weekends
top_weekend_users = weekend_repos.nlargest(5)

# Extract the 'login' column and format it as a comma-separated string
top_weekend_logins = ','.join(top_weekend_users.index)

print("Top 5 users who created the most repositories on weekends:", top_weekend_logins)


Top 5 users who created the most repositories on weekends: LinuxSuRen,zhufengnodejs,xiaoweiruby,i5ting,hailiang-wang


In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Total number of hireable users
total_hireable = users_df[users_df['hireable'] == True].shape[0]

# Total number of hireable users who have an email
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().sum()

# Calculate the fraction for hireable users
hireable_email_fraction = hireable_with_email / total_hireable if total_hireable > 0 else 0

# Total number of non-hireable users
total_non_hireable = users_df[users_df['hireable'] == False].shape[0]

# Total number of non-hireable users who have an email
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notna().sum()

# Calculate the fraction for non-hireable users
non_hireable_email_fraction = non_hireable_with_email / total_non_hireable if total_non_hireable > 0 else 0

# Calculate the difference
email_fraction_difference = hireable_email_fraction - non_hireable_email_fraction

print("Difference in fraction of users with email (hireable - non-hireable):", f"{email_fraction_difference:.3f}")


Difference in fraction of users with email (hireable - non-hireable): 0.752


In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users_modified.csv')

# Extract surnames by trimming and splitting by whitespace
# We'll ignore rows with missing names
users_df['surname'] = users_df['name'].dropna().apply(lambda x: x.strip().split()[-1])

# Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Identify the most common surname(s)
most_common_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == most_common_count].index.tolist()

# Sort surnames alphabetically
most_common_surnames.sort()

# Prepare output
surnames_output = ','.join(most_common_surnames)

print("Most common surname(s):", surnames_output)
print("Number of users with the most common surname:", most_common_count)


Most common surname(s): Zhang
Number of users with the most common surname: 11


In [None]:
import pandas as pd

# Function to convert boolean values to 'true' and 'false'
def convert_booleans(df):
    boolean_columns = df.select_dtypes(include='bool').columns.tolist()  # Identify boolean columns
    for col in boolean_columns:
        df[col] = df[col].replace({True: 'true', False: 'false'})
    return df

# Load users.csv and repositories.csv
users_df = pd.read_csv('/content/users.csv')
repos_df = pd.read_csv('/content/repositories.csv')

# Convert booleans in both DataFrames
users_df = convert_booleans(users_df)
repos_df = convert_booleans(repos_df)

# Save the modified DataFrames back to CSV
users_df.to_csv('users_modified.csv', index=False)
repos_df.to_csv('repositories_modified.csv', index=False)

print("Boolean values have been converted and saved to 'users_modified.csv' and 'repositories_modified.csv'.")


Boolean values have been converted and saved to 'users_modified.csv' and 'repositories_modified.csv'.


second most popular language

In [None]:
import pandas as pd

# Load users and repositories data
users_df = pd.read_csv('/content/users.csv')
repos_df = pd.read_csv('/content/repositories.csv')

# Convert 'created_at' to datetime format in users data
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020
recent_users = users_df[users_df['created_at'].dt.year > 2020]

# Get the logins of these recent users
recent_user_logins = recent_users['login'].unique()

# Filter repositories for these recent users
recent_repos = repos_df[repos_df['login'].isin(recent_user_logins)]

# Drop rows with missing or empty 'language' values
recent_repos = recent_repos.dropna(subset=['language'])
recent_repos = recent_repos[recent_repos['language'] != ""]

# Count the occurrences of each language
language_counts = recent_repos['language'].value_counts()

# Get the second most popular programming language
second_most_popular_language = language_counts.index[1]  # Index 1 for second most popular
second_most_popular_count = language_counts.iloc[1]  # Count of the second most popular language

print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)
print("Number of repositories in this language:", second_most_popular_count)


The second most popular programming language among users who joined after 2020 is: HTML
Number of repositories in this language: 6


question 16

In [None]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Extract surnames by trimming and splitting by whitespace
# We'll ignore rows with missing names
users_df['surname'] = users_df['name'].dropna().apply(lambda x: x.strip().split()[-1])

# Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Identify the most common surname(s)
most_common_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == most_common_count].index.tolist()

# Sort surnames alphabetically
most_common_surnames.sort()

# Prepare output
surnames_output = ','.join(most_common_surnames)

print("Most common surname(s):", surnames_output)
print("Number of users with the most common surname:", most_common_count)


Most common surname(s): Zhang
Number of users with the most common surname: 11


### Question 15

In [None]:
import pandas as pd

def analyze_email_sharing(users_csv_path='/content/users.csv'):
    # Read the complete CSV file
    df = pd.read_csv(users_csv_path)

    # Convert email column to boolean (True if email exists, False if NaN or empty)
    df['has_email'] = df['email'].notna() & (df['email'] != '')

    # Calculate for hireable users
    hireable_mask = df['hireable'] == True
    if hireable_mask.any():
        hireable_email_fraction = df[hireable_mask]['has_email'].mean()
    else:
        hireable_email_fraction = 0

    # Calculate for non-hireable users
    non_hireable_mask = df['hireable'] != True
    if non_hireable_mask.any():
        non_hireable_email_fraction = df[non_hireable_mask]['has_email'].mean()
    else:
        non_hireable_email_fraction = 0

    # Calculate difference and round to 3 decimal places
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    # Print debug information
    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {df[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {df[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return difference

# Read and analyze the complete dataset
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 360
Hireable users with email: 76/101
Non-hireable users with email: 176/259
Hireable fraction: 0.752
Non-hireable fraction: 0.680

Final result: 0.073


## **question 14**

In [None]:
import csv
from collections import Counter
from datetime import datetime

# Counter to store the number of repositories created by each user on weekends
weekend_repo_counts = Counter()

# Open the repositories.csv file and read data
with open('/content/repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            # Convert created_at string to a datetime object
            created_date = datetime.fromisoformat(created_at[:-1])  # Remove 'Z' and convert

            # Check if the day is Saturday (5) or Sunday (6)
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1  # Increment the count for the user

# Get the top 5 users who created the most repositories on weekends
top_users = weekend_repo_counts.most_common(5)

# Extract the logins of the top users
top_logins = [user[0] for user in top_users]

# Output the top users' logins as a comma-separated string
print(','.join(top_logins))


LinuxSuRen,zhufengnodejs,xiaoweiruby,i5ting,mozillazg


**question 11**

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Load the CSV file
csv_file = '/content/repositories.csv'  # Replace with the correct path

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Convert 'has_projects' and 'has_wiki' to boolean if necessary
df['has_projects'] = df['has_projects'].astype(bool)
df['has_wiki'] = df['has_wiki'].astype(bool)

# Create a contingency table
contingency_table = pd.crosstab(df['has_projects'], df['has_wiki'])

# Perform Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")


Chi-Square Statistic: 2254.041955935203
P-value: 0.0


question 13

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

def analyze_bio_followers_correlation(users_csv_path='/content/users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Filter out rows without bios
    df = df[df['bio'].notna() & (df['bio'] != '')]

    # Calculate bio length in Unicode characters
    df['bio_length'] = df['bio'].str.len()

    # Prepare data for regression
    X = df['bio_length'].values.reshape(-1, 1)
    y = df['followers'].values

    # Perform linear regression
    model = LinearRegression()
    model.fit(X, y)

    # Get the slope rounded to 3 decimal places
    slope = round(model.coef_[0], 3)

    # Print debug information
    print(f"Number of users with bios: {len(df)}")
    print(f"Bio length range: {df['bio_length'].min()} to {df['bio_length'].max()}")
    print(f"Followers range: {df['followers'].min()} to {df['followers'].max()}")
    print(f"R-squared: {model.score(X, y):.3f}")

    return slope

# Calculate the regression slope
result = analyze_bio_followers_correlation()
print(f"\nRegression slope: {result:.3f}")

Number of users with bios: 272
Bio length range: 2 to 160
Followers range: 502 to 37235
R-squared: 0.001

Regression slope: -2.994


In [None]:
import csv
from collections import Counter

# Define the list to store license names
licenses = []

# Read the CSV file with UTF-8 encoding
with open('/content/repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Check if the license_name field is present and not empty
        license_name = row.get('license_name', '').strip()
        if license_name:
            licenses.append(license_name)

# Count the occurrence of each license
license_counts = Counter(licenses)

# Get the 3 most common licenses
top_3_licenses = [license for license, count in license_counts.most_common(3)]

# Print the result as a comma-separated list
print(','.join(top_3_licenses))


N/A,MIT License,Apache License 2.0


In [None]:
import csv
from collections import Counter
from datetime import datetime

# Define the list to store programming languages
languages = []

# Read the CSV file with UTF-8 encoding
with open('/content/repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    # Iterate through the rows in the CSV
    for row in reader:
        # Parse the created_at field
        created_at = row.get('created_at', '').strip()

        # Convert the date string to a datetime object
        if created_at:
            user_join_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")

            # Check if the user joined after 2020
            if user_join_date.year > 2020:
                # Get the language field and clean it up
                language = row.get('language', '').strip()
                if language:
                    languages.append(language)

# Count the occurrence of each language
language_counts = Counter(languages)

# Find the two most common languages
most_common_languages = language_counts.most_common(2)

# Print the second most common language
if len(most_common_languages) >= 2:
    print(most_common_languages[1][0])  # Second most common language
else:
    print("Not enough language data found.")


JavaScript


In [None]:
import csv
from collections import Counter
from datetime import datetime

# Define the list to store programming languages
languages = []

# Read the CSV file with UTF-8 encoding
with open('/content/repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    # Iterate through the rows in the CSV
    for row in reader:
        # Parse the created_at field
        created_at = row.get('created_at', '').strip()

        # Convert the date string to a datetime object
        if created_at:
            user_join_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")

            # Check if the user joined after 2020
            if user_join_date.year > 2020:
                # Get the language field and clean it up
                language = row.get('language', '').strip()
                if language:
                    languages.append(language)

# Count the occurrence of each language
language_counts = Counter(languages)

# Find the two most common languages
most_common_languages = language_counts.most_common(2)

# Print the second most common language
if len(most_common_languages) >= 2:
    print(most_common_languages[1][0])  # Second most common language
else:
    print("Not enough language data found.")


JavaScript


Url- https://github.com/uma1979/github-api-analysis

Q1 - michaelliao,daimajia,xiaolai,draveness,hongyangAndroid
Q2- robin,nwind,reeze,kejun,ZhangHanDong
Q3 -N/A,MIT License,Apache License 2.0
Q4 -BYTEDANCE
Q5-JavaScript
Q6-JavaScript
Q7-Jinja
Q8-michaelliao,ityouknow,liuhuanyong,thunlp,shenghy
Q9-0.033
Q10-0.654
Q11-NaN
Q12-NaN
Q13 - no answer
Q14-LinuxSuRen,zhufengnodejs,xiaoweiruby,i5ting,mozillazg
Q15-NaN
Q16-Zhang

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('/content/users.csv')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
#users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split(" ").str.len()

# The error was here: users_with_bio was used instead of users_with_bios
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


# Prepare the data for regression
X = users_with_bios['bio_word_count'] # Independent variable
y = users_with_bios['followers'] # Dependent variable

# Add a constant to the independent variable for the regression
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.3f}')

Regression slope of followers on bio word count: -11.022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


### **Question 11**

In [None]:
import pandas as pd

# Load the data
repositories_df = pd.read_csv('/content/repositories.csv')


# Calculate the correlation directly
correlation = repositories_df['has_projects'].astype(int).corr(repositories_df['has_wiki'].astype(int))

print(f"The correlation between having projects enabled and having a wiki enabled is: {correlation:.3f}")


The correlation between having projects enabled and having a wiki enabled is: 0.277


### **Question 12**

In [None]:
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('/content/users.csv')

# Filter hireable and non-hireable users
hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

# Calculate average following for both groups
average_hireable_following = hireable_users['following'].mean()
average_non_hireable_following = non_hireable_users['following'].mean()

# Calculate the difference
difference = average_hireable_following - average_non_hireable_following

# Print the result rounded to three decimal places
print(f'Difference in average following (hireable - non-hireable): {difference:.3f}')


Difference in average following (hireable - non-hireable): 148.994


### Question 15

In [None]:
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('/content/users.csv')

# Total number of users
total_users = len(users_df)

# Filter hireable and non-hireable users
hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

# Calculate the fraction of users with email in both groups
fraction_hireable_with_email = hireable_users['email'].notna().mean()
fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

# Calculate the difference
difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Print the result rounded to three decimal places
print(f'Difference in fraction of users with email: {difference:.3f}')


Difference in fraction of users with email: 0.073


In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('/content/users.csv')

In [None]:
data.head(10)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37235,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZhenFund Beijing,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24633,271,2012-10-07T02:40:06Z
2,xiaolai,xiaolai,inblockchain,beijing,lixiaolai@gmail.com,,A lifelong student.,54,19241,37,2009-11-13T18:29:42Z
3,draveness,Draven,@spectra-fund,"Beijing, China",i@draven.co,True,HFT / C++ / Go,50,13010,28,2014-01-24T16:22:01Z
4,hongyangAndroid,张鸿洋,wanandroid.com,"Beijing,China",623565791@qq.com,True,学习ing,102,12985,35,2015-01-26T07:05:45Z
5,haoel,Hao Chen,MegaEase,Beijing,haoel@hotmail.com,,Founder of MegaEase Inc. \n\n芝兰生于空谷，不以无人而不芳。\n,24,12624,32,2011-08-30T14:05:18Z
6,wizardforcel,布客飞龙,@258ch @ApacheCN,"Beijing, China",admin@flygon.net,True,无产阶级没有版权。,86,9593,659,2013-07-24T09:40:02Z
7,i5ting,狼叔,alibaba,china beijing,i5ting@126.com,True,focus on node & rust & web3。\n2023 Yak Shaving...,902,9072,1127,2012-12-24T23:28:15Z
8,521xueweihan,削微寒,公众号：HelloGitHub,"Beijing, China",595666367@qq.com,,时间会让「平凡的事」变得「与众不同」\nTime will make ordinary th...,145,8972,178,2014-07-24T10:16:33Z
9,ityouknow,纯洁的微笑,Freedom and dreams,"beijing,china",ityouknow@126.com,,"Open source is a spirit, I enjoy it.",28,8779,1,2013-07-10T07:23:28Z


In [None]:
d=pd.read_csv('/content/users.csv')
d.head(10)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,99,37235,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZhenFund Beijing,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,89,24633,271,2012-10-07T02:40:06Z
2,xiaolai,xiaolai,inblockchain,beijing,lixiaolai@gmail.com,,A lifelong student.,54,19241,37,2009-11-13T18:29:42Z
3,draveness,Draven,@spectra-fund,"Beijing, China",i@draven.co,True,HFT / C++ / Go,50,13010,28,2014-01-24T16:22:01Z
4,hongyangAndroid,张鸿洋,wanandroid.com,"Beijing,China",623565791@qq.com,True,学习ing,102,12985,35,2015-01-26T07:05:45Z
5,haoel,Hao Chen,MegaEase,Beijing,haoel@hotmail.com,,Founder of MegaEase Inc. \n\n芝兰生于空谷，不以无人而不芳。\n,24,12624,32,2011-08-30T14:05:18Z
6,wizardforcel,布客飞龙,@258ch @ApacheCN,"Beijing, China",admin@flygon.net,True,无产阶级没有版权。,86,9593,659,2013-07-24T09:40:02Z
7,i5ting,狼叔,alibaba,china beijing,i5ting@126.com,True,focus on node & rust & web3。\n2023 Yak Shaving...,902,9072,1127,2012-12-24T23:28:15Z
8,521xueweihan,削微寒,公众号：HelloGitHub,"Beijing, China",595666367@qq.com,,时间会让「平凡的事」变得「与众不同」\nTime will make ordinary th...,145,8972,178,2014-07-24T10:16:33Z
9,ityouknow,纯洁的微笑,Freedom and dreams,"beijing,china",ityouknow@126.com,,"Open source is a spirit, I enjoy it.",28,8779,1,2013-07-10T07:23:28Z
