# <center> Scraping GitHub </center>

### Importation

In [1]:
import requests
import csv
import pandas as pd
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

### GitHub API base URL

In [2]:
base_url = 'https://api.github.com'

### GitHub access token

In [3]:
access_token = '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'

### Number of repositories per day

In [4]:
repositories_per_day = 250

### Number of days to scrape

In [5]:
days_to_scrape = 730

### Calculer les dates de début et de fin du scraping

In [6]:
end_date = datetime.now().date()
start_date = end_date - timedelta(days=days_to_scrape)

### Initialiser la liste des référentiels

In [7]:
repositories = []

### Mécanisme de nouvelle tentative pour API requests

In [8]:
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    method_whitelist=["GET"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)

  retry_strategy = Retry(


### Itérer chaque jour

In [10]:
for day in range(days_to_scrape):
    current_date = start_date + timedelta(days=day)
    formatted_date = current_date.strftime('%Y-%m-%d')

    # Récupérer des référentiels à l'aide de la pagination
    page = 1
    while len(repositories) < repositories_per_day * (day + 1):
        # Créeation l'URL de l'API pour récupérer les référentiels créés le jour en cours et une page spécifique
        url = f'{base_url}/search/repositories?q=created:{formatted_date}&sort=stars&order=desc&per_page=100&page={page}'

        # Faire la demande d'API avec une logique de nouvelle tentative
        headers = {'Authorization': f'token {access_token}'} if access_token else {}
        response = http.get(url, headers=headers)
        data = response.json()

        if 'items' in data:
            # Extraire les informations du référentiel à partir de la réponse
            items = data['items']
            sorted_items = sorted(items, key=lambda item: item['stargazers_count'], reverse=True)

            for item in sorted_items[:repositories_per_day]:
                repository = {
                    'id': item['id'],
                    'full name': item['full_name'],
                    'name': item['name'],
                    'url': item['html_url'],
                    'description': item['description'],
                    'stars': item['stargazers_count'],
                    'language': item.get('language', ''),  # Utilisez dict.get() avec une valeur par défaut
                    'forks': item['forks'],
                    'watchers': item['watchers'],
                    'open issues': item['open_issues'],
                    'owner name': item['owner']['login'],
                    'owner type': item['owner']['type'],
                    'owner site admin': item['owner']['site_admin'],
                    'license name': item['license']['name'] if item['license'] else '',
                    'license key': item['license']['key'] if item['license'] else '',
                    'topics': item['topics'],
                    'creation date': item['created_at'],
                    'updated at': item['updated_at'],
                    'pushed at': item['pushed_at'],
                    'size': item ['size'],
                    'score': item['score'],
                    'has projects': item['has_projects'],
                    'has downloads': item['has_downloads'],
                    'has wiki': item['has_wiki'],
                    'has discussions': item['has_discussions'],
                    'permissions': item['permissions'],
                    'has discussions': item['has_discussions'],
                    'mirror url': item['mirror_url'],
                    'archived': item['archived'],
                    'disabled': item['disabled'],
                    'allow forking': item['allow_forking'],
                    'is template': item['is_template'],
                    'web commit signoff required': item['web_commit_signoff_required']
                }

                repositories.append(repository)

        page += 1

        if 'next' not in response.links:
            break

### Output CSV file name

In [11]:
output_file = 'repositories_data.csv'

### Write the repository data to a CSV file

In [12]:
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['id','full name','name', 'url', 'description',
                  'stars', 'language','forks','watchers','open issues',
                  'owner name','owner type','owner site admin','license name','license key',
                  'topics', 'creation date','updated at','pushed at',
                  'size','score','has projects','has downloads','has wiki',
                  'has discussions','permissions','has discussions','archived','disabled',
                  'allow forking','is template','web commit signoff required']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(repositories)

print('Repositories scraped and saved successfully!')

Repositories scraped and saved successfully!


### Save to Excel file 

In [14]:
# Convert extracted data to a DataFrame
df = pd.DataFrame(repositories)

# Define the file paths
excel_file_path = 'repositories_data.xlsx'

# Write the data to the Excel file
df.to_excel(excel_file_path, index=False)

### Read data

In [13]:
df = pd.read_csv('repositories_data.csv')
df

  df = pd.read_csv('repositories_data.csv')


Unnamed: 0,id,full name,name,url,description,stars,language,forks,watchers,open issues,...,has wiki,has discussions,permissions,has discussions.1,mirror url,archived,disabled,allow forking,is template,web commit signoff required
0,379414969,jamesstringerparsec/Easy-GPU-PV,Easy-GPU-PV,https://github.com/jamesstringerparsec/Easy-GP...,A Project dedicated to making GPU Partitioning...,3014,PowerShell,324,3014,142,...,True,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False
1,379231368,spencerwooo/onedrive-vercel-index,onedrive-vercel-index,https://github.com/spencerwooo/onedrive-vercel...,"OneDrive public directory listing, powered by ...",2609,TypeScript,3231,2609,28,...,False,True,"{'admin': False, 'maintain': False, 'push': Fa...",True,,False,False,True,False,False
2,379247713,NVlabs/alias-free-gan,alias-free-gan,https://github.com/NVlabs/alias-free-gan,Alias-Free GAN project website and code,1332,,43,1332,0,...,False,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False
3,379429942,fishfolk/jumpy,jumpy,https://github.com/fishfolk/jumpy,Tactical 2D shooter in fishy pixels style. Mad...,1307,Rust,101,1307,62,...,False,True,"{'admin': False, 'maintain': False, 'push': Fa...",True,,False,False,True,False,False
4,379360003,OffcierCia/Crypto-OpSec-SelfGuard-RoadMap,Crypto-OpSec-SelfGuard-RoadMap,https://github.com/OffcierCia/Crypto-OpSec-Sel...,"Here we collect and discuss the best DeFi, Blo...",1070,,91,1070,0,...,True,True,"{'admin': False, 'maintain': False, 'push': Fa...",True,,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182594,656805552,Manlostech/Compolotics,Compolotics,https://github.com/Manlostech/Compolotics,This is my personal blog / portfolio website. ...,3,JavaScript,0,3,0,...,True,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False
182595,656838271,swyxio/ai-engineer,ai-engineer,https://github.com/swyxio/ai-engineer,AI Engineer website,3,TypeScript,0,3,0,...,True,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False
182596,656897180,Irengs1/Irengs1,Irengs1,https://github.com/Irengs1/Irengs1,Config files for my GitHub profile.,3,,0,3,0,...,False,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False
182597,656634242,GrayJimars/WHU2023SecurityMakersOpenSource,WHU2023SecurityMakersOpenSource,https://github.com/GrayJimars/WHU2023SecurityM...,,3,C,1,3,0,...,True,False,"{'admin': False, 'maintain': False, 'push': Fa...",False,,False,False,True,False,False


# <center> Fin </center>