In [3]:
# Install dependencies into the *kernel's* environment (%pip, not !pip).
# pyspark is pinned to the version this notebook was last run against so a
# fresh run resolves the same release; -q keeps the install log out of the output.
%pip install -q pyspark==3.5.1
%pip install -q requests

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=24f8899d4156c43431d0cda518931040911b354022a6dcbe4b5691b67656d0bf
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [7]:
import logging
import os

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, date_format, to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Read the GitHub token from the environment so the secret never ends up in
# the notebook source, its outputs, or version control. Set GITHUB_API_TOKEN
# in the environment before starting the kernel.
GITHUB_API_TOKEN = os.environ.get('GITHUB_API_TOKEN', '')

# Request headers shared by every GitHub API call below. The Authorization
# header is only sent when a token is configured; without one the requests go
# out unauthenticated instead of carrying an invalid credential.
headers = {'Authorization': f'token {GITHUB_API_TOKEN}'} if GITHUB_API_TOKEN else {}

def get_followers(username: str, timeout: float = 30) -> list:
    """
    Retrieves all followers of a given GitHub user.

    The followers endpoint is paginated, so this keeps following the
    ``rel="next"`` link of each response until there is no further page.

    Args:
        username (str): The username of the GitHub user.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        list: One dictionary per follower, as decoded from the API responses.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        Exception: Any other error raised while requesting or decoding a page
            is logged and re-raised unchanged.
    """
    followers = []
    url = f'https://api.github.com/users/{username}/followers'
    # Ask for the largest page size on the first request only; the "next"
    # URLs returned by the API already carry their own query string.
    params = {'per_page': 100}
    try:
        while url:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of returning the error payload,
            # which callers would otherwise iterate as if it were followers.
            response.raise_for_status()
            followers.extend(response.json())
            url = response.links.get('next', {}).get('url')
            params = None
        return followers
    except Exception as e:
        logging.exception("An error occurred while getting followers: %s", e)
        raise

def get_user_details(username: str, timeout: float = 30) -> dict:
    """
    Retrieves the details of a user from the GitHub API.

    Args:
        username (str): The username of the user.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: A dictionary containing the user details.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        Exception: Any other error raised while requesting or decoding the
            response is logged and re-raised unchanged.
    """
    url = f'https://api.github.com/users/{username}'
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Surface 4xx/5xx responses as exceptions rather than handing the
        # error payload back as if it were a user record.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logging.exception("An error occurred while getting user details: %s", e)
        raise

if __name__ == "__main__":
    # Pipeline: fetch the followers of `username`, look up each follower's
    # profile, load the profiles into Spark, normalise two columns, then
    # preview the result and export it as CSV.
    username = 'cvscarlos'
    followers = get_followers(username)

    spark = SparkSession.builder.appName("App").getOrCreate()

    # Columns kept from each user-details response; all nullable.
    schema = StructType([
        StructField('login', StringType(), True),
        StructField('name', StringType(), True),
        StructField('company', StringType(), True),
        StructField('blog', StringType(), True),
        StructField('email', StringType(), True),
        StructField('bio', StringType(), True),
        StructField('public_repos', IntegerType(), True),
        StructField('followers', IntegerType(), True),
        StructField('following', IntegerType(), True),
        StructField('created_at', StringType(), True),
    ])

    # Collect plain records first and build the DataFrame once. Creating and
    # union-ing one DataFrame per follower makes the query plan grow with
    # every row.
    field_names = schema.fieldNames()
    records = []
    for follower in followers:
        user_data = get_user_details(follower['login'])
        # Keep only the schema's fields; absent keys become NULL.
        records.append({field: user_data.get(field) for field in field_names})

    spark_df = spark.createDataFrame(records, schema=schema)

    # Strip "@" from company names and reformat the ISO-8601 creation
    # timestamp as dd/MM/yyyy.
    spark_df = spark_df.withColumn("company", regexp_replace("company", "@", ""))
    spark_df = spark_df.withColumn("created_at", to_date("created_at", "yyyy-MM-dd'T'HH:mm:ss'Z'"))
    spark_df = spark_df.withColumn("created_at", date_format("created_at", "dd/MM/yyyy"))

    selected_cols = ['login', 'name', 'company', 'blog', 'email', 'bio', 'public_repos', 'followers', 'following', 'created_at']
    spark_df = spark_df.select(*selected_cols)

    spark_df.show(5)
    # Spark writes a directory at this path. Overwrite it so re-running the
    # notebook does not fail because the output already exists.
    spark_df.write.mode("overwrite").csv(f"github_followers_{username}_.csv", header=True)

+----------------+------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+----------+
|           login|              name|             company|                blog|               email|                 bio|public_repos|followers|following|created_at|
+----------------+------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+----------+
|     felipegomes|      Felipe Gomes|            NEW/DELI|http://www.newdel...|felipegomesbwm@gm...|                NULL|           1|        2|        3|26/11/2008|
|          rduran|     Rodrigo Duran|                NULL|http://raduran.bl...|   raduran@gmail.com|                NULL|          10|        7|       39|05/10/2009|
|      fhferreira|Flávio H. Ferreira|   Homestaymatch.com|https://f-h-ferre...|                NULL|Senior Software D...|        1712|      270|     1279|16/10/2009|
|   