# <center> Scraping GitHub </center>

### Importation

In [1]:
import requests
import csv
import pandas as pd
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

### GitHub API base URL

In [2]:
base_url = 'https://api.github.com'

### My GitHub username

In [3]:
username = 'yaserrati'

### GitHub access token

In [4]:
access_token = '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'

### Number of repositories per day

In [5]:
repositories_per_day = 750

### Number of days to scrape

In [6]:
days_to_scrape = 365

### Calculer les dates de début et de fin du scraping

In [7]:
end_date = datetime.now().date()
start_date = end_date - timedelta(days=days_to_scrape)

### Initialiser la liste des référentiels

In [8]:
repositories = []

### Mécanisme de nouvelle tentative pour API requests

In [9]:
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, ², 502, 503, 504],
    method_whitelist=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

  retry_strategy = Retry(


### Itérer chaque jour

In [10]:
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Fetch repositories using pagination
    page = 1
    while len(repositories) < repositories_per_day * (day + 1):
        # Create the API URL to fetch repositories created on the current day and specific page
        url = f'{base_url}/search/repositories?q=created:{formatted_date}&sort=stars&order=desc&per_page=100&page={page}'

        # Make the API request with retry logic
        headers = {'Authorization': f'token {access_token}'} if access_token else {}
        response = http.get(url, headers=headers)
        data = response.json()

        if 'items' in data:
            # Extract repository information from the response
            items = data['items']
            sorted_items = sorted(items, key=lambda item: item['stargazers_count'], reverse=True)

            for item in sorted_items[:repositories_per_day]:
                repository = {
                    'name': item['name'],
                    'url': item['html_url'],
                    'description': item['description'],
                    'stars': item['stargazers_count'],
                    'created_at': item['created_at'],
                    'language': item.get('language', ''),  # Use dict.get() with a default value
                    'forks': item['forks'],
                    'watchers': item['watchers'],
                    'open_issues': item['open_issues'],
                    'owner': item['owner']['login']
                }

                repositories.append(repository)

        page += 1

        if 'next' not in response.links:
            break

### Output CSV file name

In [11]:
output_file = 'repositories_data.csv'

### Write the repository data to a CSV file

In [12]:
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['name', 'url', 'description', 'stars', 'created_at', 'language','forks','watchers','open_issues','owner']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(repositories)

print('Repositories scraped and saved successfully!')

Repositories scraped and saved successfully!


### Save to Excel file 

In [13]:
# Convert extracted data to a DataFrame
df = pd.DataFrame(repositories)

# Define the file paths
excel_file_path = 'repositories_data.xlsx'

# Write the data to the Excel file
df.to_excel(excel_file_path, index=False)

### Read data

In [14]:
df = pd.read_csv('repositories_data.csv')
df

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
0,notepad--,https://github.com/cxasm/notepad--,一个支持windows/linux/mac的文本编辑器，目标是做中国人自己的编辑器，来自中国。,2877,2022-06-15T02:50:15Z,C++,114,2877,111,cxasm
1,pocket-casts-android,https://github.com/Automattic/pocket-casts-and...,Pocket Casts Android 🎧,2243,2022-06-15T12:41:42Z,Kotlin,165,2243,252,Automattic
2,nvim-basic-ide,https://github.com/LunarVim/nvim-basic-ide,🪨 This is my attempt at a basic stable startin...,1653,2022-06-15T23:06:53Z,Lua,437,1653,1,LunarVim
3,sismo-hub,https://github.com/sismo-core/sismo-hub,,890,2022-06-15T16:44:53Z,TypeScript,181,890,7,sismo-core
4,Antenna,https://github.com/wuba/Antenna,Antenna是58同城安全团队打造的一款辅助安全从业人员验证网络中多种漏洞是否存在以及可利...,675,2022-06-15T06:42:25Z,JavaScript,72,675,3,wuba
...,...,...,...,...,...,...,...,...,...,...
273791,preparation,https://github.com/sxa4680/preparation,Proper preparation solves 80 percent of life’s...,2,2023-06-14T04:18:45Z,,2,2,0,sxa4680
273792,1254,https://github.com/tomenli/1254,1254,2,2023-06-14T04:54:55Z,,0,2,0,tomenli
273793,vm,https://github.com/wayneundefined/vm,,2,2023-06-14T06:44:18Z,,1,2,0,wayneundefined
273794,drugmanage,https://github.com/Derunter5958/drugmanage,管理系统的代码,2,2023-06-14T07:20:22Z,Vue,0,2,1,Derunter5958


### Drop les doublons

In [15]:
df.drop_duplicates()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
0,notepad--,https://github.com/cxasm/notepad--,一个支持windows/linux/mac的文本编辑器，目标是做中国人自己的编辑器，来自中国。,2877,2022-06-15T02:50:15Z,C++,114,2877,111,cxasm
1,pocket-casts-android,https://github.com/Automattic/pocket-casts-and...,Pocket Casts Android 🎧,2243,2022-06-15T12:41:42Z,Kotlin,165,2243,252,Automattic
2,nvim-basic-ide,https://github.com/LunarVim/nvim-basic-ide,🪨 This is my attempt at a basic stable startin...,1653,2022-06-15T23:06:53Z,Lua,437,1653,1,LunarVim
3,sismo-hub,https://github.com/sismo-core/sismo-hub,,890,2022-06-15T16:44:53Z,TypeScript,181,890,7,sismo-core
4,Antenna,https://github.com/wuba/Antenna,Antenna是58同城安全团队打造的一款辅助安全从业人员验证网络中多种漏洞是否存在以及可利...,675,2022-06-15T06:42:25Z,JavaScript,72,675,3,wuba
...,...,...,...,...,...,...,...,...,...,...
273791,preparation,https://github.com/sxa4680/preparation,Proper preparation solves 80 percent of life’s...,2,2023-06-14T04:18:45Z,,2,2,0,sxa4680
273792,1254,https://github.com/tomenli/1254,1254,2,2023-06-14T04:54:55Z,,0,2,0,tomenli
273793,vm,https://github.com/wayneundefined/vm,,2,2023-06-14T06:44:18Z,,1,2,0,wayneundefined
273794,drugmanage,https://github.com/Derunter5958/drugmanage,管理系统的代码,2,2023-06-14T07:20:22Z,Vue,0,2,1,Derunter5958


# <center> Fin </center>