In [2]:
# importing required libraries
import requests
from bs4 import BeautifulSoup

import pandas as pd
import math
import re
import random 
from datetime import datetime
from collections import OrderedDict

import os
import io
import tokenize
import zipfile
import radon
import traceback
from radon.raw import analyze
from radon.visitors import ComplexityVisitor, HalsteadVisitor
from radon.metrics import *
from radon import metrics

from radon.complexity import cc_rank, cc_visit

from tqdm.notebook import tqdm #(for Jupyter notebook)
# from tqdm import tqdm   # for all other IDEs

pd.set_option('display.max_columns', 500)

In [3]:
def get_topics(nb_topics):
    topics_links=[]
    url=f"https://github.com/topics?page="
        
    for i in tqdm(range(1+(nb_topics//30))):
        pr=requests.get(url+str(i+1))
        pr_soup = BeautifulSoup(pr.text,'html.parser')
        #list of all topics in the page
        topics=pr_soup.find_all('a',{'href':re.compile('/topics/*'), "class":"flex-1"})
        topics_links.extend([(topic["href"], topic.findChild('p',{"class":"f5"}).text.strip()) for topic in topics])
        
    return topics_links[:nb_topics]

In [4]:
get_topics(100)

  0%|          | 0/4 [00:00<?, ?it/s]

[('/topics/3d',
  '3D modeling is the process of virtually developing the surface and structure of a 3D object.'),
 ('/topics/ajax',
  'Ajax is a technique for creating interactive web applications.'),
 ('/topics/algorithm',
  'Algorithms are self-contained sequences that carry out a variety of tasks.'),
 ('/topics/amphp', 'Amp is a non-blocking concurrency library for PHP.'),
 ('/topics/android',
  'Android is an operating system built by Google designed for mobile devices.'),
 ('/topics/angular', 'Angular is an open source web application platform.'),
 ('/topics/ansible', 'Ansible is a simple and powerful automation engine.'),
 ('/topics/api',
  'An API (Application Programming Interface) is a collection of protocols and subroutines for building software.'),
 ('/topics/arduino',
  'Arduino is an open source hardware and software company and maker community.'),
 ('/topics/aspnet',
  'ASP.NET is a web framework for building modern web apps and services.'),
 ('/topics/atom',
  'Atom is 

In [5]:
def get_repo(topic_url, nb_repo, desc=True):
    repos_links=[]
    
    if desc:
        order="desc"
    else:
        order="asc"
    
    url=f"https://github.com/{topic_url}?o={order}&s=stars&l=python&page="
        
    for i in tqdm(range(1+(nb_repo//30))):
        pr=requests.get(url+str(i+1))
        pr_soup = BeautifulSoup(pr.text,'html.parser')
        #list of all topics in the page
        repos=pr_soup.find_all('a',{"class":"wb-break-word"})
        repos_links.extend([repo["href"] for repo in repos])
        
    return repos_links[:nb_repo]

In [6]:
get_repo("topics/covid-19", 34, desc=False)

  0%|          | 0/2 [00:00<?, ?it/s]

['/lucaslopes/covid19collab',
 '/ClownMonster/Covid_Data_WebScrapping',
 '/kobonyo/Africacovid-19',
 '/jasonli0616/hdsb-covid-api',
 '/flyme2bluemoon/COVID-19-in-Ontario-Schools',
 '/EimaMei/Covid-API',
 '/contykang/covidaroundyou',
 '/michalzembron/WebDataParser',
 '/marko4789/Visualizacion-Covid19-MX',
 '/kastnerp/SARS-CoV-2_AirborneDecay',
 '/1k-ct/kobe-COVID-19',
 '/CianHub/100DaysOfPythonDay6',
 '/idealius/transpose_covid_timeseries',
 '/tsarpaul/COVID19-Graph',
 '/chris20lee/COVID-Report',
 '/francBara/Covid19-Plotter',
 '/AdeelGH/panama_covid_gif',
 '/JulianChia/lowerboundSARSCOV2',
 '/StevenHuang2020/COVID-19-Statistics',
 '/amandeepsinghkhanna/covid19_web_scrapper',
 '/MilesSpence/HonorsCapstone',
 '/ykursadkaya/covid19-turkey-api',
 '/Vewaa/tCovid',
 '/chilija92/covid-19-hospital-situation',
 '/Ninjalice/Covid-Navarra',
 '/aahad91/Covid-19-Visual',
 '/narayanarajvj/Covid-Updates',
 '/Kira060200/unemployment-pre-and-post-covid-romania',
 '/amgsnt/covid19',
 '/zakiego/dataset-z

In [7]:
headers_list = [
    # Firefox 77 Mac
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    # Firefox 77 Windows
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    # Chrome 83 Mac
    {
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
    },
    # Chrome 83 Windows 
    {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }
]
# Create ordered dict from Headers above
ordered_headers_list = []
for headers in headers_list:
    h = OrderedDict()
for header,value in headers.items():
    h[header]=value
    ordered_headers_list.append(h)

In [8]:
random.choice(headers_list)

{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 'Accept-Language': 'en-US,en;q=0.5',
 'Referer': 'https://www.google.com/',
 'DNT': '1',
 'Connection': 'keep-alive',
 'Upgrade-Insecure-Requests': '1'}

In [9]:
ordered_headers_list

[OrderedDict([('Connection', 'keep-alive'),
              ('Upgrade-Insecure-Requests', '1'),
              ('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'),
              ('Accept',
               'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
              ('Sec-Fetch-Site', 'same-origin'),
              ('Sec-Fetch-Mode', 'navigate'),
              ('Sec-Fetch-User', '?1'),
              ('Sec-Fetch-Dest', 'document'),
              ('Referer', 'https://www.google.com/'),
              ('Accept-Encoding', 'gzip, deflate, br'),
              ('Accept-Language', 'en-US,en;q=0.9')]),
 OrderedDict([('Connection', 'keep-alive'),
              ('Upgrade-Insecure-Requests', '1'),
              ('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8

In [10]:
class Repo_info():
    
    
#     # Create ordered dict from Headers above
#     self.ordered_headers_list = []
#     for headers in headers_list:
#         h = OrderedDict()
#     for header,value in headers.items():
#         h[header]=value
#         self.ordered_headers_list.append(h)

    def __init__(self, url):     
        
        self.commits_date=[]
        self.commits_message=[]
        self.files_path=[]
        self.code_path={}
        self.comment_list={}
        self.docstr_list={}
        
        self.code_metric={}
        self.halstead_metric={}
        self.halstead_vol={}
        self.cyclomatic_complexity={}
        self.number_of_LLOC={}
        self.percent_comment={}
        self.loc={}
        self.lloc={}
        self.sloc={}
        self.comment={}
        self.multi={}
        self.single_comments={}
        self.blank={}
        
        
        self.headers_list = [
            # Firefox 77 Mac
            {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Referer": "https://www.google.com/",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1"
            },
            # Firefox 77 Windows
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Referer": "https://www.google.com/",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1"
            },
            # Chrome 83 Mac
            {
                "Connection": "keep-alive",
                "DNT": "1",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Dest": "document",
                "Referer": "https://www.google.com/",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
            },
            # Chrome 83 Windows 
            {
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "Sec-Fetch-Site": "same-origin",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-User": "?1",
                "Sec-Fetch-Dest": "document",
                "Referer": "https://www.google.com/",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"
            }
        ]
        
        headers = random.choice(self.headers_list)
        #Create a request session
        self.r = requests.Session()
        self.r.headers = headers
            
        fork_level=0
        while True:
            self.partial_url=url
            self.url=f"https://github.com{self.partial_url}" 
            self.pr=self.r.get(self.url)
            self.pr_soup = BeautifulSoup(self.pr.text,'html.parser')  # creating beautiful soup object  

            forked_src=self.pr_soup.find_all('a', {"data-hovercard-type":"repository"})
            
            if fork_level>5 or len(forked_src)==0:
                break
            else:
                url=forked_src[0]["href"]
                fork_level+=1
        
        # Numbers of stars : 
        self.stars=self.get_stars()
        #Numbers of contributors : 
        self.contributors=self.get_contributors()
        #Numbers of forks : 
        self.forks=self.get_forks()
        #Numbers of watchers : 
        self.watchers=self.get_watchers()
        #Numbers of pull request : 
        self.pull_request=self.get_pull_request()
        #Numbers of issues : 
        self.issues=self.get_issues()
        #Numbers closed bug issues
        self.closed_bug_issues=self.get_closed_bug_issues()
        #Numbers open bug issues
        self.open_bug_issues=self.get_open_bug_issues()
        #Numbers of tags : 
        self.tags=self.get_tags()
        #Tags_descriptor : 
        self.tags_descriptor=self.get_tags_descriptor()
        
        #Numbers of commits : 
        _=self.get_commits()
        
        #Zip file
        self.zip_repo=self.get_zip_repo()
        #list of files :
        _=self.get_files_path()
        #collect code :
        _=self.get_code()
        # for i in repo_info.get_files_path():
        #     print(f"  ==>  {i}")
        _=self.get_comment()
        #delta of time between the first and the last commit
        self.delta_time=(self.commits_date[0]-self.commits_date[-1]).days if len(self.commits_date)>2 else 0
        
        #Calculate Complexity
        _=self.get_metric()
        
        
    
    def get_commits(self):
        # nb commits
        main_branch=self.pr_soup.find_all('svg',{"class":"octicon-history"})[0].parent#'href':re.compile('/commits/master$')})
        self.main_branch_name=main_branch["href"].split('/')[-1]
        nb_commits=int(main_branch.text.strip().split("\n")[0].replace(",", ""))
        
        with tqdm(total=nb_commits) as pbar:
            pr=self.r.get(f'{self.url}/commits/{self.main_branch_name}')
            pr_soup = BeautifulSoup(pr.text,'html.parser')  # creating beautiful soup object  
            
            commits_date=[datetime.strptime(commit_date['datetime'].partition('T')[0], '%Y-%m-%d').date() for commit_date in pr_soup.find_all('relative-time')]
            self.commits_date.extend(commits_date)
            
            commits_message=[commit_message.text.replace("…", "").strip() for commit_message in pr_soup.find_all('p',{'class':'mb-1'})]
            self.commits_message.extend(commits_message)
            
            pbar.update(len(commits_date))
            
            while (len(next_page_links_list:=pr_soup.find_all(text=re.compile('Older')))>0 and
                   (next_page_link:=next_page_links_list[0].parent).name=="a"):
                
                pr =self.r.get(next_page_link["href"])  # creating beautiful soup object    
                pr_soup = BeautifulSoup(pr.text,'html.parser')  # creating beautiful soup object    
                
                commits_date=[datetime.strptime(commit_date['datetime'].partition('T')[0], '%Y-%m-%d').date() for commit_date in pr_soup.find_all('relative-time')]
                self.commits_date.extend(commits_date)
                
                commits_message=[commit_message.text.replace("…", "").strip() for commit_message in pr_soup.find_all('p',{'class':'mb-1'})]
                self.commits_message.extend(commits_message)
                
                pbar.update(len(commits_date))
                
        # print(f" => nb_commits = {nb_commits}")
        # print(f" => len(self.commits_date) = {len(self.commits_date)}")
        # print(f" => len(self.commits_message) = {len(self.commits_message)}")
        
        
        try:
            assert nb_commits == len(self.commits_date) == len(self.commits_message)
        except:
            print("  ***  get_commits  ***")
            print(f" => nb_commits = {nb_commits}")
            print(f" => len(self.commits_date) = {len(self.commits_date)}")
            print(f" => len(self.commits_message) = {len(self.commits_message)}")
            print("  ***  ")

        
        return len(self.commits_date)
        

    def get_first_commits(self):
        if len(self.commits_date)==0:
            self.get_commits()
        
        try:
            assert min(self.commits_date) == self.commits_date[-1]
            
        except:
            print("  ***  get_first_commits  ***")
            print(f"    => min(self.commits_date) : {min(self.commits_date)}")
            print(f"    => self.commits_date[-1] : {self.commits_date[-1]}")
            print("  ***  ")
            
        
        return min(self.commits_date)
    
    def get_last_commits(self):
        if len(self.commits_date)==0:
            self.get_commits()
        
        # locating & extracting tags for last committed time
        #last_commit_atag =pr_soup2.find_all('a',{'class':'Link--secondary ml-2'})
        #last_updated = last_commit_atag[0].find_all('relative-time')[0]['datetime'] if len(last_commit_atag)>=1 else None
        
        # dernier commit
        last_commits=datetime.strptime(self.pr_soup.find_all('relative-time')[0]['datetime'].partition('T')[0], '%Y-%m-%d').date()
        
        try:
            assert last_commits == max(self.commits_date) == self.commits_date[0]
        
        except:
            print("  ***  get_last_commits  ***")
            print(f"    => last_commits : {last_commits}")
            print(f"    => max(self.commits_date) : {max(self.commits_date)}")
            print(f"    => self.commits_date[0] : {self.commits_date[0]}")
            print("  ***  ")
            
        
        return max(self.commits_date)
    
    def numerize(self, s):
        multipliers = {'k': 10**3, 'm': 10**6, 'b': 10**9}

        if s[-1] in multipliers:
            return int(s[:-1]) * multipliers[s[-1]]
        else:
            return int(s)
    
    def get_watchers(self):
        watcher_tmp=self.pr_soup.find_all('a',{'href':re.compile('/watchers$')})
        watcher=watcher_tmp[0].text.strip().split("\n")[0]
        return self.numerize(watcher)
        
    def get_stars(self):
        # locating & extracting tags for star counts
        star_span_tag = self.pr_soup.find_all('span',{'id':'repo-stars-counter-star'})
        stars = int(star_span_tag[0]['aria-label'].split()[0])
        return stars
    
    def get_forks(self):
        # locating & extracting tags for forks counts
        forks_span_tag =self.pr_soup.find_all('span',{'id':'repo-network-counter'})
        forks = int(forks_span_tag[0]['title'].replace(',', ''))
        return forks
    
    def get_contributors(self):
        # Contributeurs
        tmp=self.pr_soup.find_all(text=re.compile("Contributors"))
        return int(tmp[0].next_element.text) if len(tmp)>0 else 0
    
    def get_tags(self):
        tags=self.pr_soup.find_all('a',{'href':re.compile('/tags$')})[1].find("strong").text
        return self.numerize(tags)
    
    def get_tags_descriptor(self):
        tags_descriptor=[x.text.strip() for x in self.pr_soup.find_all("a", {"class":"topic-tag"})]
        return tags_descriptor
        
    def get_branches(self):
        branches=self.pr_soup.find_all('a',{'href':re.compile('/branches$')})[1].find("strong").text
        return self.numerize(branches)
    
    def get_closed_bug_issues(self):
        pr=self.r.get(f"{self.url}/issues?q=is%3Aissue+label%3Abug")
        pr_soup = BeautifulSoup(pr.text,'html.parser')
        
        closed_bug_issues=pr_soup.find_all('a',{"data-ga-click":"Issues, Table state, Closed"})
        
        return int(closed_bug_issues[0].text.strip().split(" Closed")[0]) if len(closed_bug_issues)>0 else 0
    
    def get_open_bug_issues(self):
        pr=self.r.get(f"{self.url}/issues?q=is%3Aissue+label%3Abug")
        pr_soup = BeautifulSoup(pr.text,'html.parser')
        
        open_bug_issues=pr_soup.find_all('a',{"data-ga-click":"Issues, Table state, Open"})
        
        return int(open_bug_issues[0].text.strip().split(" Open")[0]) if len(open_bug_issues)>0 else 0
    
    def get_issues(self):
        issues=self.pr_soup.find_all('span',{'id':"issues-repo-tab-count"})
        return self.numerize(issues[0].text) if len(issues)>0 else None
    
    def get_pull_request(self):
        pull_requests=self.pr_soup.find_all('span',{'id':"pull-requests-repo-tab-count"})
        return self.numerize(pull_requests[0].text) if len(pull_requests)>0 else None
    
#     def get_files_path(self):
#         self.files_path=[]
#         directories_path=[f"{self.partial_url}/tree/{self.main_branch_name}/"]

#         for directory in directories_path:

#             pr=requests.get(f"https://github.com{directory}")
#             pr_soup = BeautifulSoup(pr.text,'html.parser')

#             self.files_path.extend([file_balise.parent.find_next('a',{'href':re.compile(f"{self.partial_url}/blob/*")})["href"] for file_balise in pr_soup.find_all('svg',{"aria-label":"File"})])

#             sub_directories=pr_soup.find_all('svg',{"aria-label":"Directory"})
#             if len(sub_directories)>0:
#                 directories_path.extend([sub_dir_balise.parent.find_next('a',{'href':re.compile(f"{self.partial_url}/tree/*")})["href"] for sub_dir_balise in sub_directories])
        
#         return self.files_path
    
#     def get_code(self, url=None):
#         if len(self.files_path)==0 and not url:
#             self.get_files_path()
        
#         for file_path in self.files_path:
#             if file_path.endswith("py"):
#                 # scrapper le code
#                 pr =requests.get(f"https://github.com{file_path}")  # request data    
#                 pr_soup = BeautifulSoup(pr.text,'html.parser')  # creating beautiful soup object 

#                 # script pour parser le code ...
#                 codes_tmp=pr_soup.find_all('td',{'class':'blob-code'})
#                 codes=[code.text for code in codes_tmp]
#                 self.code_path[file_path]=codes
#         return self.code_path


    # Mimic os.walk() function for zipfiles
    
    def get_zip_repo(self):
        zip_repo_url=f"{self.url}/archive/refs/heads/{self.main_branch_name}.zip"
        r = self.r.get(zip_repo_url)
        zip_repo = zipfile.ZipFile(io.BytesIO(r.content))
        return zip_repo
    
    def zipwalk(self, zfile):
        # Initialize database
        dlistdb = {}

        # Walk through zip file information list
        for info in zfile.infolist():
            if info.is_dir():
                zpath = os.path.dirname(os.path.dirname(info.filename).rstrip('/'))
                zfile = os.path.basename(os.path.dirname(info.filename).rstrip('/'))
                if zpath in dlistdb:
                    dlistdb[zpath][0].append(zfile)
                else:
                    dlistdb[zpath] = [[zfile], []]
            else:
                zpath = os.path.dirname(info.filename)
                zfile = os.path.basename(info.filename)
                if zpath in dlistdb:
                    dlistdb[zpath][1].append(zfile)
                else:
                    dlistdb[zpath] = [[], [zfile]]

        # Convert to os.walk() output format
        dlist = []
        for key in dlistdb.keys():
            dlist.append((key, dlistdb[key][0], dlistdb[key][1]))

        return iter(dlist)

    def get_files_path(self):
        self.files_path=[]
        
        db = self.zipwalk(self.zip_repo)
        #print(list(db))
        for root, dirs, files in db:#os.walk(z, topdown=False):
            for name in files:
                self.files_path.append("/".join([root, name]))#os.path.join(root, name))
                
        return self.files_path

    def get_code(self, url=None):
        if len(self.files_path)==0 and not url:
            self.get_files_path()
        
        for file_path in self.files_path:
            if file_path.endswith("py"):
                with io.TextIOWrapper(self.zip_repo.open(file_path), encoding= 'cp1252') as f: #encoding = 'utf-8'
                    self.code_path[file_path]=f.read()
        return self.code_path
        
    def get_comment(self, url=None):
        if len(self.code_path)==0 and not url:
            self.get_code()
        
        for file_path, code in self.code_path.items():
            if file_path.endswith("py"):
                #self.get_code(file_path)
                #try:
                
                self.comment_list[file_path]=re.findall('(?<=#).*$', code, re.M)

                pattern=re.compile(r'''(['"])\1\1(.*?)\1{3}''', re.DOTALL)
                self.docstr_list[file_path]=[docstring_content for docstring_type, docstring_content in re.findall(pattern, code)]
                
#                 with io.StringIO(f"{os.linesep}".join(code)) as file:
#                     try:
#                         for toktype, tok, start, end, line in tokenize.generate_tokens(file.readline):
#                             # we can also use token.tok_name[toktype] instead of 'COMMENT'
#                             # from the token module 
#                             line_clean=line.strip()
#                             if toktype == tokenize.STRING and (line_clean[:3]=="'''" or line_clean[:3]=='"""'):
#                             #if toktype == tokenize.STRING and start[1]==0:
#                                 #tmp=line[:start[1]].strip()
#                                 #if (len(tmp)==0) or (len(tmp)>0 and not tmp[-1]=="="):

#                                 #print('DOCSTRING' + " " + tok)
#                                 self.docstr_list[file_path].append(tok)

#                             elif toktype == tokenize.COMMENT:
#                                 #print('COMMENT' + "  ___  " + tok)
#                                 self.comment_list[file_path].append(tok)
#                     except Exception as e:
#                         print(f"\033[38;2;{255};{0};{0}m {e} \033[38;2;255;255;255m")
#                     #    print(f"\033[38;2;{255};{0};{0}m error in {file_path} \033[38;2;255;255;255m")
                
        return self.docstr_list, self.comment_list   
    
    def get_metric(self):
        for file_path, code in self.code_path.items():
            try:
                metric=radon.metrics.mi_parameters(code, count_multi=True)
                #print("pass metric")
            except Exception:
                metric=(None, None, None, None)
                print("fail metric")
               
                
            try:
                metric1=cc_visit(code)#.complexity
                #print("pass metric1")
            except Exception:
                metric1=None
                print("fail metric1")
                
                
            try:
                metric2=radon.metrics.h_visit(code)
                #print("pass metric2")
            except Exception:
                metric2=None
                print("fail metric2")
                
                
            try:
                metric3=radon.raw.analyze(code)
                #print("pass metric3")
            except Exception:
                metric3=(None, None, None, None, None, None, None)
                print("fail metric3")
               # print(traceback.format_exc())
            
                
            self.halstead_vol[file_path]=metric[0],
            self.cyclomatic_complexity[file_path]=metric[1],
            self.number_of_LLOC[file_path]=metric[2],
            self.percent_comment[file_path]=metric[3]
            
            self.code_metric[file_path]=metric1
            self.halstead_metric[file_path]=metric2
            
            self.loc[file_path]=metric3[0]
            self.lloc[file_path]=metric3[1]
            self.sloc[file_path]=metric3[2]
            self.comment[file_path]=metric3[3]
            self.multi[file_path]=metric3[4]
            self.single_comments[file_path]=metric3[5]
            self.blank[file_path]=metric3[6]
            
        return self.halstead_vol, self.cyclomatic_complexity, self.number_of_LLOC, self.percent_comment, self.code_metric, self.halstead_metric, self.loc, self.lloc, self.sloc, self.comment, self.multi, self.single_comments, self.blank
        
    def __str__(self):
                
        tmp="""list of files :"""
        for i in self.files_path:
            tmp+=f"\n  ==>  {i}"
        return f"""
        Numbers of stars : {self.stars}
        Numbers of contributors : {self.contributors}
        Numbers of forks : {self.forks}
        Numbers of watchers : {self.watchers}
        Numbers of pull request : {self.pull_request}
        Numbers of issues : {self.issues}
        Numbers of tags : {self.tags}
        Tags_descriptor : {self.tags_descriptor}
        Numbers of commits : {len(self.commits_date)}
        Date of first commits : {self.commits_date[-1]}
        Date of last commits : {self.commits_date[0]}
        
        """ + tmp
        
    def __as_dict__(self):
        return {k: v for k, v in self.__dict__.items() if k not in set({"pr_soup", "pr", "zip_repo", "r"})}

In [11]:
repo_info=Repo_info("/helmy-elrais/RoBERT_Recurrence_over_BERT")

  0%|          | 0/38 [00:00<?, ?it/s]

In [32]:
%%time
repo_topics={}

for topic, topic_description in get_topics(1000):

    repo_list=[]
    print(f" *** topic : {topic} ***")

    # repo avec le moins d'étoiles
    for repo in get_repo(topic_url=topic, nb_repo=6, desc=False):
        
        try:
            print(f"    => repo asc : {repo}")
            repo_list.append(Repo_info(repo))
        except Exception:
            print(traceback.format_exc())
            continue


        #repo avec le plus d'étoiles
    #for repo in get_repo(topic_url=topic, nb_repo=5, desc=True):
     #   try:
      #      print(f"    => repo asc : {repo}")
       #     repo_list.append(Repo_info(repo))
        #except Exception:
         #   pass
          #  print(traceback.format_exc())
           # continue


    repo_topics[topic]=repo_list

  0%|          | 0/34 [00:00<?, ?it/s]

Wall time: 3.01 s


In [30]:
# si le code est vide ==> bloque !!!

In [31]:
df_repo_topics2=pd.DataFrame([{"topic": topic, **repo_info.__as_dict__()} 
                             for topic, repos in repo_topics.items() for repo_info in repos])
df_repo_topics2.to_json("./repo_topic.json")

In [26]:
df_repo_topics2.to_csv('data code and metrics github2')

In [28]:
df_repo_topics2

In [33]:
%%time

for topic, topic_description in get_topics(500):

    # repo avec le plus d'étoiles
    for repo in get_repo(topic_url=topic, nb_repo=5, desc=True):
        
        try:
            print(f"    => repo asc : {repo}")
            repo_list.append(Repo_info(repo))
        except Exception:
            print(traceback.format_exc())
            continue


  0%|          | 0/17 [00:00<?, ?it/s]

Wall time: 1.42 s


In [25]:
df_repo_topics=pd.DataFrame([{"topic": topic, **repo_info.__as_dict__()} 
                             for topic, repos in repo_topics.items() for repo_info in repos])
df_repo_topics.to_json("./repo_topic.json")

In [26]:
df_repo_topics.explode('halstead_metric')[['partial_url','halstead_metric']]

Unnamed: 0,partial_url,halstead_metric
0,/fwidmaier/mesh_handler,mesh_handler-2/main.py
0,/fwidmaier/mesh_handler,mesh_handler-2/wireframe.py
0,/fwidmaier/mesh_handler,mesh_handler-2/linalg/__init__.py
0,/fwidmaier/mesh_handler,mesh_handler-2/mesh/__init__.py
0,/fwidmaier/mesh_handler,mesh_handler-2/mesh/obj.py
...,...,...
43,/codingforentrepreneurs/eCommerce,eCommerce-master/src/tags/shell_commands.py
43,/codingforentrepreneurs/eCommerce,eCommerce-master/src/tags/tests.py
43,/codingforentrepreneurs/eCommerce,eCommerce-master/src/tags/views.py
43,/codingforentrepreneurs/eCommerce,eCommerce-master/src/tags/migrations/0001_init...


In [25]:
df_repo_topics.shape

(21, 35)

In [None]:
from sklearn.metrics import f1_score
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
f1_score(y_true, y_pred, average='macro')

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import math

In [2]:
a=[11, 30, 100, 41, 19, 0, 10, 55, 78, 90, -10, -11, 6, 105]
b={}
c=""

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
a=[[11], [30], [100], [41], [19], [0], [10], [55], [78], [90], [-10], [-11], [6], [105]]
print(scaler.fit(a))
print(scaler.transform(a))

StandardScaler()
[[-0.66606688]
 [-0.1872188 ]
 [ 1.57695833]
 [ 0.09000904]
 [-0.46444663]
 [-0.94329471]
 [-0.69126941]
 [ 0.44284446]
 [ 1.02250266]
 [ 1.32493303]
 [-1.19532002]
 [-1.22052255]
 [-0.79207953]
 [ 1.70297099]]


In [5]:
 from sklearn.preprocessing import MinMaxScaler
#data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
a=[[11], [30], [100], [41], [19], [0], [10], [55], [78], [90], [-10], [-11], [6], [105]]
scaler = MinMaxScaler()
print(scaler.fit(a))
print(scaler.transform(a))

MinMaxScaler()
[[0.18965517]
 [0.35344828]
 [0.95689655]
 [0.44827586]
 [0.25862069]
 [0.09482759]
 [0.18103448]
 [0.56896552]
 [0.76724138]
 [0.87068966]
 [0.00862069]
 [0.        ]
 [0.14655172]
 [1.        ]]


In [56]:
type(print(5.89))

5.89


NoneType

In [58]:
def f():
    print("ok")
type(f())

ok


NoneType

In [62]:
(e,r,t)=(1,3,4)

In [63]:
e

1

In [64]:
new=10
def f(val):
    new=val
    return val
f(3)
new

10

In [65]:
raisee=lambda x, y : y **2

In [66]:
raisee(2,3)

9

In [67]:
po="Da"
o=iter(po)
next(o)

'D'

In [69]:
print(*iter("Lincold"))

L i n c o l d


In [70]:
mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},

          {'a': 100, 'b': 200, 'c': 300, 'd': 400},

          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]

df = pd.DataFrame(mydict)

In [73]:
type(df[["a"]])

pandas.core.frame.DataFrame