In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
from env import github_token, github_username
import acquire
import pandas as pd
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import regex as re
import time
import numpy as np
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud


In [None]:
def create_urls(num=5000):
    ''' Build the GitHub search-result page URLs for the "#defi" repository query.

    Nothing is downloaded here: the function only formats one URL per page
    number, for pages 0 through 100 inclusive (101 URLs in total).

    num: accepted for interface compatibility; it does not influence the result.

    Prints the page numbers and the finished URL list, then returns the URLs
    as a list of strings.

    NOTE(review): page 0 presumably shows the same results as page 1 - confirm
    before relying on the pages being distinct.
    '''
    page_numbers = list(range(101))
    print(page_numbers)

    urls = []
    for page in page_numbers:
        urls.append(f'https://github.com/search?p={page}&q=%23defi&type=Repositories&per_page=100')

    print(urls)
    return urls

##### create_urls()

In [None]:
def get_endpoints(url):
    ''' Fetch one GitHub search-results page and return the repository links on it.

    url: address of a search-results page (as produced by create_urls()).

    The request is repeated every 20 seconds, without an upper bound, until the
    server answers with a success status; 'sleeping' is printed before each wait.

    Returns a list with the `href` of the first link inside every
    <div class="f4 text-normal"> element. Elements that contain no link are
    skipped instead of raising.
    '''
    headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

    while True:
        response = requests.get(url, headers=headers)
        if response.ok:
            break
        print('sleeping')
        time.sleep(20)

    # Name the parser explicitly so the parse does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    endpoints = []
    for group in soup.find_all('div', {"class": "f4 text-normal"}):
        # Read the attribute from the parsed tag rather than regex-matching the
        # serialized HTML, which broke whenever another quoted attribute
        # followed href on the same line.
        link = group.find('a', href=True)
        if link is not None:
            endpoints.append(link['href'])

    return endpoints

In [None]:
# Spot check: fetch the repository links from a single search-results page (p=100).
get_endpoints('https://github.com/search?p=100&q=%23defi&type=Repositories&per_page=100')

In [None]:
def make_all_endpoints():
    ''' Collect the repository links from every search-results page.

    Takes the page URLs from create_urls() and calls get_endpoints() on each,
    in order. Every URL is printed once up front and once more after its page
    has been processed; the number of processed pages is printed at the end.

    Returns a list of lists: one inner list of endpoints per page URL.
    '''
    search_pages = create_urls()
    for page_url in search_pages:
        print(page_url)

    endpoints_per_page = []
    for page_url in search_pages:
        page_endpoints = get_endpoints(page_url)
        endpoints_per_page.append(page_endpoints)
        print(page_url)

    print(len(endpoints_per_page))

    return endpoints_per_page

In [None]:
# Crawl every search-results page; the nested list of endpoints is shown as the cell output.
make_all_endpoints()

In [None]:
def acquire_endpoints():
    ''' Crawl all search pages and persist the result to 'endpoints.csv'.

    The per-page endpoint lists returned by make_all_endpoints() are wrapped in
    a pandas Series named 'endpoints' (one list per entry), written to
    'endpoints.csv' without the index, and returned.
    '''
    per_page = make_all_endpoints()
    endpoint_series = pd.Series(per_page, name='endpoints')
    endpoint_series.to_csv('endpoints.csv', index=False)
    return endpoint_series

In [None]:
# acquire_endpoints() calls make_all_endpoints() itself, so this runs the crawl
# and writes endpoints.csv; the returned Series is kept in `x`.
x=acquire_endpoints()

In [None]:
def flatten_endpoints(path='endpoints.csv'):
    ''' Read the saved endpoints CSV and flatten it into one endpoint per entry.

    path: CSV file to read. Defaults to 'endpoints.csv', the file written by
          acquire_endpoints().

    Each cell is expected to hold the text form of a Python list of endpoint
    strings (e.g. "['/owner/a', '/owner/b']"), which is parsed with
    ast.literal_eval. A cell that does not parse as a list or tuple falls back
    to the earlier heuristic: split on single quotes and keep pieces longer than
    three characters. Cells that are not text (for example NaN) are skipped.

    Returns a pandas Series named 'endpoints'.
    '''
    import ast  # local import keeps this cell self-contained

    frame = pd.read_csv(path)

    flat = []
    # Row-major walk over every cell, matching the nested row/column loops
    # this replaces.
    for cell in frame.to_numpy().ravel():
        if not isinstance(cell, str):
            continue
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            parsed = None

        if isinstance(parsed, (list, tuple)):
            flat.extend(str(item) for item in parsed)
        else:
            flat.extend(piece for piece in cell.split("'") if len(piece) > 3)

    return pd.Series(flat, name='endpoints')

In [None]:
# Reload the saved crawl from endpoints.csv as a flat Series of endpoint strings.
endpoints = flatten_endpoints()

In [None]:
# Plain Python list of the endpoints; scrape_github_data() iterates over this global.
REPOS = list(endpoints)

In [None]:
# NOTE(review): this displays every entry from index 5 onward; if a short preview
# was intended, `REPOS[:5]` may be what was meant - confirm.
REPOS[5:]

In [None]:

# Headers sent with the GitHub API requests below: a token-based Authorization
# value and the account name as the User-Agent, both taken from env.py.
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username,
}

# Stop immediately when either credential was left empty.
if any((headers["Authorization"] == "token ", headers["User-Agent"] == "")):
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    ''' Send a GET request to the GitHub API and return the decoded JSON body.

    url: full API URL to request; the module-level `headers` are sent with it.

    Returns the parsed JSON (a list or a dict, depending on the endpoint).

    Raises Exception when the status code is not 200. The message carries the
    status code and the response body: re-serialized JSON when the body is
    valid JSON, the raw text otherwise.
    '''
    response = requests.get(url, headers=headers)

    # Check the status before decoding: an error response is not guaranteed to
    # be JSON, and decoding it first would mask the real failure with a JSON
    # decode error.
    if response.status_code != 200:
        try:
            detail = json.dumps(response.json())
        except ValueError:
            detail = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {detail}"
        )
    return response.json()


def get_repo_language(repo: str) -> Optional[str]:
    ''' Look up the "language" field GitHub reports for a repository.

    repo: repository path, either "/owner/name" (the form stored in REPOS) or
          "owner/name"; a leading slash is optional.

    Returns the value of the "language" key in the API response. It can be
    None: the scraped data contains repositories with no language recorded.

    Raises Exception when the API response is not a dictionary or has no
    "language" key, and propagates any error raised by github_api_request().
    '''
    # Normalize the slug so both "/owner/name" and "owner/name" yield the same URL.
    url = f"https://api.github.com/repos/{repo.lstrip('/')}"
    repo_info = github_api_request(url)
    if isinstance(repo_info, dict):
        if "language" not in repo_info:
            raise Exception(
                "'language' key not found in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )

def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    ''' List the entries in the root directory of a repository.

    repo: repository path, either "/owner/name" (the form stored in REPOS) or
          "owner/name"; a leading slash is optional.

    Returns the list of entry dictionaries sent back by the contents endpoint.

    Raises Exception when the response is not a list, and propagates any error
    raised by github_api_request().
    '''
    # Build the URL from the `repo` argument. The previous version interpolated
    # the whole global REPOS list here, which never produced a valid URL.
    url = f"https://api.github.com/repos/{repo.lstrip('/')}/contents/"
    contents = github_api_request(url)
    if isinstance(contents, list):
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )

def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    The first entry whose name starts with "readme" (case-insensitive) and
    that has a non-empty "download_url" wins. Entries without a download url
    are skipped, so a directory that merely starts with "readme" no longer
    makes the function return None.

    Returns "" when no such entry exists.
    """
    for file in files:
        name = file.get("name") or ""
        download_url = file.get("download_url")
        if name.lower().startswith("readme") and download_url:
            return download_url
    return ""

def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo path like "/gocodeup/codeup-setup-script" (the form stored in
    REPOS) and returns a dictionary with the repo path, the language of the
    repo and the readme contents.

    The README is downloaded once, from the url chosen by
    get_readme_download_url(); when the repo has no README the contents are "".
    Errors raised by get_repo_contents() and get_repo_language() propagate.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)

    if not readme_download_url:
        readme_contents = ""
    else:
        # One request is enough; the file used to be fetched twice.
        readme_contents = requests.get(readme_download_url).text

    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


In [None]:
def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.

    For every entry in REPOS the root directory listing is requested, the
    README (if any) is downloaded once, and the language is looked up. A repo
    is skipped, with a printed message, when the listing or the README request
    does not answer with status 200, or when the language lookup raises.

    Returns a list of dictionaries with the keys "repo", "language" and
    "readme_contents" ("" when the repo has no README).
    """
    output = []
    for repo in REPOS:
        print(repo)

        url = f"https://api.github.com/repos{repo}/contents/"
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Skipping {repo} because its HTTP status code is {response.status_code}")
            continue

        readme_download_url = get_readme_download_url(response.json())
        if not readme_download_url:
            readme_contents = ""
        else:
            # Reuse this single response; the README used to be downloaded twice.
            readme_response = requests.get(readme_download_url)
            if readme_response.status_code != 200:
                print(f"Skipping {repo} because its HTTP status code is {readme_response.status_code}")
                continue
            readme_contents = readme_response.text

        # get_repo_language signals failures with a plain Exception. Skip the
        # repo, like the other failure paths, so that one bad lookup does not
        # discard everything collected so far.
        try:
            language = get_repo_language(repo)
        except Exception as error:
            print(f"Skipping {repo} because its language lookup failed: {error}")
            continue

        output.append(
            {
                "repo": repo,
                "language": language,
                "readme_contents": readme_contents,
            }
        )

    return output

In [None]:
# Scrape every repo in REPOS; each repo costs several HTTP requests.
data = scrape_github_data()

In [None]:
# Preview the first five scraped records.
data[:5]

In [None]:
import csv

# Persist the scraped records to data.csv, using the keys of the first record
# as the column header.
to_csv = data
if not to_csv:
    raise ValueError("No scraped records to write: `data` is empty")
keys = to_csv[0].keys()

# Write UTF-8 explicitly: README text is frequently non-ASCII, and the platform
# default encoding may fail on it or disagree with pd.read_csv's UTF-8 default.
with open('data.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(to_csv)

In [2]:
# Load the scraped records written to data.csv and preview the first rows.
df = pd.read_csv('data.csv') 
df.head()

Unnamed: 0,repo,language,readme_contents
0,/OffcierCia/DeFi-Developer-Road-Map,,# DeFi Developer Road Map\n\n**Here we collect...
1,/smartcontractkit/full-blockchain-solidity-cou...,,<!-- [YouTube Video](https://www.youtube.com/w...
2,/rainbow-me/rainbow,TypeScript,![](https://pbs.twimg.com/profile_banners/1103...
3,/Bytom/bytom,Go,Bytom\n======\n\n[![Build Status](https://trav...
4,/DimensionDev/Maskbook,TypeScript,<!-- cspell:disable -->\n<!-- markdownlint-dis...


In [3]:
# Number of repos per language (value_counts leaves out missing values by default).
df.language.value_counts()

JavaScript          239
TypeScript          224
Solidity            168
Python               58
Rust                 32
HTML                 26
Go                   25
C++                  13
CSS                  13
Vue                  12
Shell                12
Jupyter Notebook     10
C#                   10
SCSS                  5
Java                  4
Kotlin                4
Swift                 3
Racket                2
Elixir                2
Clojure               2
Scala                 1
Dockerfile            1
TeX                   1
Haskell               1
Ruby                  1
Nim                   1
Motoko                1
Vyper                 1
Cairo                 1
Elm                   1
PHP                   1
Clarity               1
Tcl                   1
q                     1
PLpgSQL               1
Svelte                1
Name: language, dtype: int64

### Prepare

In [4]:
# Count missing values per column before cleaning.
df.isnull().sum()

repo                 0
language           130
readme_contents     34
dtype: int64

In [5]:
# (rows, columns) before dropping incomplete rows.
df.shape

(1010, 3)

In [6]:
# Drop every row that has a missing value in any column, then show the new shape.
# `df` is rebound here, so the unfiltered frame is only recoverable by re-reading data.csv.
df = df.dropna()
df.shape

(851, 3)

In [7]:
def basic_clean(string):
    ''' Normalize a piece of text and return the cleaned string.

    Steps, in order:
      1. lowercase the text;
      2. apply NFKD unicode normalization, encode to ASCII dropping anything
         that cannot be represented, and decode back to a str;
      3. replace every character that is not a lowercase letter, a digit, an
         apostrophe or whitespace with a single space.
    '''
    lowered = string.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", ' ', ascii_only)

In [8]:
# Overwrite the README text with its basic_clean() version, then preview the result.
df['readme_contents'] = df.readme_contents.apply(basic_clean)
df.head()

Unnamed: 0,repo,language,readme_contents
2,/rainbow-me/rainbow,TypeScript,https pbs twimg com profile banners 1103...
3,/Bytom/bytom,Go,bytom\n \n\n build status https trav...
4,/DimensionDev/Maskbook,TypeScript,cspell disable \n markdownlint dis...
5,/ccyanxyz/uniswap-arbitrage-analysis,Python,uniswap arbitrage analysis\n\n see readme ...
6,/chainsulting/Smart-Contract-Security-Audits,HTML,smart contracts audits by chainsulting\nsmar...


In [9]:
def tokenize(string):
    ''' Tokenize text with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the result is a single
    string rather than a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [10]:
def lemmatize(string):
    ''' Lemmatize each whitespace-separated word with NLTK's WordNetLemmatizer.

    Returns the lemmas joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [11]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    ''' Remove English stopwords from whitespace-separated text.

    string:        text to filter; it is split on whitespace.
    extra_words:   optional iterable of words taken OUT of the stopword set,
                   so they are kept in the text. Words that are not stopwords
                   are ignored instead of raising ValueError.
    exclude_words: optional iterable of words ADDED to the stopword set, so
                   they are removed from the text.

    NOTE(review): the two parameter names read as if their roles could be the
    other way round; the behavior documented above is what this function has
    always done - confirm it is what callers intend.

    Returns the remaining words joined with single spaces.
    '''
    # A set gives constant-time membership tests for the filter below.
    stopword_set = set(stopwords.words('english'))

    if exclude_words:
        stopword_set.update(exclude_words)

    if extra_words:
        stopword_set.difference_update(extra_words)

    return ' '.join(word for word in string.split() if word not in stopword_set)


In [12]:
# Run the cleaned README text through tokenize -> lemmatize -> remove_stopwords, in that order.
df['readme_contents'] = df.readme_contents.apply(tokenize).apply(lemmatize).apply(remove_stopwords)

In [13]:
# Number of whitespace-separated words left in each prepared README.
df['word_count'] = df['readme_contents'].apply(lambda x : len(x.split()))

In [14]:
# Length of each prepared README with space characters removed, then a preview of the frame.
df['character_count'] = df['readme_contents'].apply(lambda x : len(x.replace(" ","")))
df.head()

Unnamed: 0,repo,language,readme_contents,word_count,character_count
2,/rainbow-me/rainbow,TypeScript,http pb twimg com profile banner 1103191459409...,326,1825
3,/Bytom/bytom,Go,bytom build status http travis ci org bytom by...,742,3859
4,/DimensionDev/Maskbook,TypeScript,cspell disable markdownlint disable inline htm...,471,2572
5,/ccyanxyz/uniswap-arbitrage-analysis,Python,uniswap arbitrage analysis see readme en pdf 0...,994,4263
6,/chainsulting/Smart-Contract-Security-Audits,HTML,smart contract audit chainsulting smart contra...,193,1154


In [15]:
# Confirm that no missing values remain after preparation.
df.isnull().sum()

repo               0
language           0
readme_contents    0
word_count         0
character_count    0
dtype: int64