In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username
from most_forked_repos import repos

# Acquire

In [2]:
# Repositories to scrape, taken from the most_forked_repos module.
REPOS = repos

# Headers sent with every GitHub API call: the token authenticates the request
# and the account name is used as the User-Agent. Both values come from env.
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username,
}

# Fail fast when either credential was left as an empty string.
if not (headers["Authorization"] != "token " and headers["User-Agent"] != ""):
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )

In [3]:
def github_api_request(url: str) -> Union[List, Dict]:
    """
    Send a GET request to the GitHub API using the module-level `headers`
    and return the decoded JSON body.

    Args:
        url: Full API url to request.

    Returns:
        The parsed JSON payload (a list or a dict, depending on the endpoint).

    Raises:
        Exception: if the response status code is not 200. The message carries
            the status code and the response body.
        ValueError: if a 200 response does not contain valid JSON.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON; decoding them up front
        # would hide the real status code behind a JSON decode error.
        try:
            error_body = json.dumps(response.json())
        except ValueError:
            error_body = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {error_body}"
        )
    return response.json()

In [4]:
def get_repo_language(repo: str) -> Optional[str]:
    """
    Look up the language GitHub reports for a repo.

    Args:
        repo: Repo name in "owner/name" form.

    Returns:
        The value of the "language" key in the API response. This can be None
        when the API reports no language for the repo.

    Raises:
        Exception: if the API response is not a dictionary, or if it has no
            "language" key.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is not dict:
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    repo_info = cast(Dict, repo_info)
    if "language" not in repo_info:
        raise Exception(
            "'language' key not found in response\n{}".format(json.dumps(repo_info))
        )
    return repo_info["language"]

In [5]:
def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Request the contents listing at the root of a repo from the GitHub API.

    Returns the listing when the API answers with a list; raises an Exception
    for any other response type.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)

In [7]:
def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    Entries are matched case-insensitively on a name starting with "readme".
    A matching entry without a usable download url (for example a directory,
    for which the API reports a null download_url) is skipped so that the
    caller never receives None.

    Returns an empty string when no downloadable README entry is found.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        download_url = file.get("download_url")
        if download_url:
            return download_url
    return ""

In [8]:
def process_repo(repo: str) -> Dict[str, str]:
    """
    Build the record for a single repo (e.g. "gocodeup/codeup-setup-script"):
    its name, the language reported by the API, and the text of its README.

    The README text is an empty string when no README download url is found.
    The repo name is printed as a progress marker.
    """
    readme_url = get_readme_download_url(get_repo_contents(repo))
    readme_text = requests.get(readme_url).text if readme_url != "" else ""
    print(repo)
    record = {"repo": repo}
    record["language"] = get_repo_language(repo)
    record["readme_contents"] = readme_text
    return record

In [10]:
def scrape_github_data() -> List[Dict[str, str]]:
    """
    Process every repo listed in REPOS, in order, and return the list of
    resulting records.
    """
    processed = []
    for repo in REPOS:
        processed.append(process_repo(repo))
    return processed

In [11]:
# Scrape every repo and cache the results in data.json.
if __name__ == "__main__":
    data = scrape_github_data()
    # The context manager guarantees the file is flushed and closed even if
    # serialization fails part-way through.
    with open("data.json", "w") as output_file:
        json.dump(data, output_file, indent=1)

jtleek/datasharing
rdpeng/ProgrammingAssignment2
octocat/Spoon-Knife
tensorflow/tensorflow
SmartThingsCommunity/SmartThingsPublic
twbs/bootstrap
github/gitignore
Pierian-Data/Complete-Python-3-Bootcamp
nightscout/cgm-remote-monitor
rdpeng/ExData_Plotting1
jwasham/coding-interview-university
opencv/opencv
EbookFoundation/free-programming-books
CyC2018/CS-Notes
tensorflow/models
eugenp/tutorials
jackfrued/Python-100-Days
firstcontributions/first-contributions
torvalds/linux
Snailclimb/JavaGuide
rdpeng/RepData_PeerAssessment1
facebook/react
spring-projects/spring-boot
jlord/patchwork
TheAlgorithms/Python
barryclark/jekyll-now
spring-projects/spring-framework
DataScienceSpecialization/courses
ant-design/ant-design
vuejs/vue
github/docs
getify/You-Dont-Know-JS
angular/angular.js
donnemartin/system-design-primer
freeCodeCamp/freeCodeCamp
DefinitelyTyped/DefinitelyTyped
PanJiaChen/vue-element-admin
django/django
kamranahmedse/developer-roadmap
apache/spark
django/django
kamranahmedse/develope

angular-ui/bootstrap
dotnet/aspnetcore
RocketChat/Rocket.Chat
hashicorp/terraform
macrozheng/mall-learning
moment/moment
square/retrofit
yiisoft/yii2
cocos2d/cocos2d-x
521xueweihan/HelloGitHub
microsoft/terminal
videojs/video.js
seata/seata
arduino/Arduino
JeffLi1993/springboot-learning-example
romeokienzler/TensorFlow
QSCTech/zju-icicles
elastic/kibana
Alamofire/Alamofire
apache/incubator-mxnet
halo-dev/halo
php/php-src
impress/impress.js
NixOS/nixpkgs
fchollet/deep-learning-with-python-notebooks
yunjey/pytorch-tutorial
amjuarez/bytecoin
microsoft/sql-server-samples
sentsin/layui
xamarin/xamarin-forms-samples
Micropoor/Micro8
ColorlibHQ/gentelella
donnemartin/data-science-ipython-notebooks
Homebrew/brew
dcloudio/mui
amix/vimrc
vnpy/vnpy
chrislgarry/Apollo-11
CMU-Perceptual-Computing-Lab/openpose
phonegap/phonegap-start
learn-co-students/py-lists-with-maps-data-science-intro-000
storybookjs/storybook
iperov/DeepFaceLab
floodsung/Deep-Learning-Papers-Reading-Roadmap
OpenZeppelin/openzep

# Prepare

In [12]:
import pandas as pd
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

In [13]:
def basic_clean(some_string):
    '''
    Lowercases the string, normalizes unicode characters (NFKD) and drops anything that cannot be
    encoded as ascii, then removes every character that is not a lowercase letter, digit, ', or whitespace
    '''
    lowered = some_string.lower()
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [14]:
def tokenize(some_string):
    '''
    Runs the string through nltk's ToktokTokenizer (with return_str = True) and returns the result
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(some_string, return_str = True)

In [15]:
def remove_stopwords(some_string, extra_words = [], exclude_words = []):
    '''
    Takes in a string and removes the words found in the nltk english stopwords list.

    extra_words: additional words to treat as stopwords
    exclude_words: words to take out of the stopword list so that they are kept in the text;
                   words that are not in the list are ignored, and a word given in both
                   extra_words and exclude_words is kept in the text

    Returns the remaining whitespace-separated words joined by single spaces.
    '''
    # Neither default list is mutated here, so sharing the defaults between calls is safe.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words)
    # Remove exclude_words (not extra_words) so that both arguments take effect.
    stopword_set.difference_update(exclude_words)
    return ' '.join(word for word in some_string.split() if word not in stopword_set)

In [16]:
def lemmatize(some_string):
    '''
    Splits the string on whitespace, lemmatizes each word with nltk's WordNetLemmatizer,
    and returns the lemmas joined by single spaces
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, some_string.split()))

In [17]:
def stem(some_string):
    '''
    Splits the string on whitespace, stems each word with nltk's PorterStemmer,
    and returns the stems joined by single spaces
    '''
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in some_string.split())

In [18]:
def prep_nlp(df, original_text_col = 'original', extra_words = [], exclude_words = []):
    '''
    Take in df with raw content, create new columns for cleaned content, stemmed content, and lemmatized content

    df: dataframe holding the raw text and a 'language' column
    original_text_col: name of the column that holds the raw text
    extra_words / exclude_words: forwarded to remove_stopwords

    The 'clean', 'stemmed' and 'lemmatized' columns are added to the dataframe that was passed in.
    Returns a new dataframe without rows containing nulls, restricted to the top 4 languages.
    '''
    # Forward the word lists by keyword: positionally, Series.apply would bind them to its own
    # convert_dtype / args parameters instead of handing them to remove_stopwords.
    df['clean'] = (
        df[original_text_col]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords, extra_words = extra_words, exclude_words = exclude_words)
    )
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['clean'].apply(lemmatize)
    top4 = ['JavaScript', 'Java', 'Python', 'C++']
    df = df.dropna()
    # Filtering with a mask avoids assigning into the dropna() result, while keeping exactly
    # the rows whose language is one of the top 4.
    return df[df['language'].isin(top4)]

In [19]:
def split_readme(df):
    '''
    Splits a df into train, validate, and test dfs, stratified on the language column.
    20% is held out as test, then 25% of the remainder as validate,
    giving final proportions of 60/20/20 for train/validate/test.
    Prints the resulting proportions and row counts before returning the three dfs.
    '''
    total_rows = df.shape[0]
    remainder, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.language)
    train, validate = train_test_split(remainder, test_size=.25, random_state=123, stratify=remainder.language)
    print(
        f'Train Proportion: {train.shape[0] / total_rows:.2f} ({train.shape[0]} rows)\n'
        f'Validate Proportion: {validate.shape[0] / total_rows:.2f} ({validate.shape[0]} rows)    \n'
        f'Test Proportion: {test.shape[0] / total_rows:.2f} ({test.shape[0]} rows)'
    )
    return train, validate, test

# Practice Acquire and Prepare

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import strftime
import numpy as np

import scrape_url_list as s
import acquire as a
import prepare as p

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [21]:
# Fetch the repo links via scrape_url_list.get_cached_links() and display them
# (presumably a cached copy of an earlier scrape -- confirm in scrape_url_list)
links = s.get_cached_links()
links

['https://github.com/josephmisiti/awesome-machine-learning',
 'https://github.com/microsoft/ML-For-Beginners',
 'https://github.com/bfortuner/ml-glossary',
 'https://github.com/trekhleb/homemade-machine-learning',
 'https://github.com/dangkhoasdc/awesome-ai-residency',
 'https://github.com/Spandan-Madan/DeepLearningProject',
 'https://github.com/roboticcam/machine-learning-notes',
 'https://github.com/ZuzooVn/machine-learning-for-software-engineers',
 'https://github.com/eugeneyan/applied-ml',
 'https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap',
 'https://github.com/khangich/machine-learning-interview',
 'https://github.com/EthicalML/awesome-production-machine-learning',
 'https://github.com/Machine-Learning-Tokyo/Interactive_Tools',
 'https://github.com/GokuMohandas/MadeWithML',
 'https://github.com/ml-tooling/best-of-ml-python',
 'https://github.com/yandexdataschool/Practical_RL',
 'https://github.com/visenger/awesome-mlops',
 'https://github.com/aladdinpersson/Machi

In [22]:
# Get rid of the duplicate links while keeping the first-seen order.
# Unlike slicing a fixed number of entries off the end, this is safe to re-run:
# running the cell again does not remove any further links.
# NOTE(review): assumes the repeated links at the end are the only unwanted entries.
links = list(dict.fromkeys(links))
links

['https://github.com/josephmisiti/awesome-machine-learning',
 'https://github.com/microsoft/ML-For-Beginners',
 'https://github.com/bfortuner/ml-glossary',
 'https://github.com/trekhleb/homemade-machine-learning',
 'https://github.com/dangkhoasdc/awesome-ai-residency',
 'https://github.com/Spandan-Madan/DeepLearningProject',
 'https://github.com/roboticcam/machine-learning-notes',
 'https://github.com/ZuzooVn/machine-learning-for-software-engineers',
 'https://github.com/eugeneyan/applied-ml',
 'https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap',
 'https://github.com/khangich/machine-learning-interview',
 'https://github.com/EthicalML/awesome-production-machine-learning',
 'https://github.com/Machine-Learning-Tokyo/Interactive_Tools',
 'https://github.com/GokuMohandas/MadeWithML',
 'https://github.com/ml-tooling/best-of-ml-python',
 'https://github.com/yandexdataschool/Practical_RL',
 'https://github.com/visenger/awesome-mlops',
 'https://github.com/aladdinpersson/Machi

In [None]:
# I'm not going to test this cell because it would take too long and might get blocked by GitHub

# # This is the loop that I used to get the top 500 most forked repos on GitHub
# from time import sleep
# cards_list = []
# link_list = []
# for i in range(1,51):
#     url = 'https://github.com/search?o=desc&p={}&q=stars%3A%3E1&s=forks&type=Repositories'
#     url = url.format(i)
#     response = requests.get(url, headers={"user-agent": "Codeup"})
#     print(response.status_code)
#     soup = BeautifulSoup(response.text)
#     cards = soup.select('.repo-list-item')
#     cards_list.append(cards)
#     #print(i)
#     #print(url)
#     #print(len(cards))
#     #print(len(cards_list))
#     for card in cards:
#         link = card.select_one('.v-align-middle').attrs['href']
#         link_list.append(link)
#     sleep(30)
# [link[1:] for link in link_list]

In [28]:
# This cell raises a NameError because it relies on `link_list` from the previous cell, which was not run

len(link_list)

NameError: name 'link_list' is not defined

In [24]:
# Skip ahead to where data was put into a cached dataframe

# Load the cached scrape results (data.json) into a dataframe that gives
# repo name, language, and readme contents, then preview the first rows
df = pd.read_json('data.json')
df.head()

Unnamed: 0,repo,language,readme_contents
0,jtleek/datasharing,,How to share data with a statistician\n=======...
1,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...
2,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...
3,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www...."
4,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...


In [26]:
# (rows, columns) of the loaded dataframe
df.shape

(500, 3)

In [27]:
# Pull the readme text of a single repo: the row with index label 0, column "readme_contents"
df.loc[0 ,'readme_contents']



In [30]:
# Count repos per language; dropna=False keeps the missing-language rows as their own bucket
df.language.value_counts(dropna=False)

JavaScript          100
Java                 77
NaN                  60
Python               54
C++                  30
TypeScript           26
Jupyter Notebook     26
Go                   19
HTML                 16
C                    15
PHP                  12
Ruby                 11
Shell                 8
Kotlin                5
CSS                   5
C#                    4
Swift                 4
Rascal                3
Vue                   3
Scala                 2
Objective-C           2
Nix                   2
Dart                  2
R                     2
Assembly              1
ApacheConf            1
SCSS                  1
Dockerfile            1
TeX                   1
PowerShell            1
Vim script            1
Rich Text Format      1
Nunjucks              1
Groovy                1
Less                  1
Rust                  1
Name: language, dtype: int64

In [32]:
# There are a bunch of nulls for language, but none for the other features

df.isna().sum()

repo                0
language           60
readme_contents     0
dtype: int64

In [33]:
# The four most common languages in the dataset
top4 = ['JavaScript', 'Java', 'Python', 'C++']

# Relabel every language outside the top 4 (missing values included) as 'Other'
df.language = df.language.where(df.language.isin(top4), 'Other')

In [34]:

# Check the relabelled distribution: the top 4 languages plus the 'Other' bucket
df.language.value_counts()

Other         239
JavaScript    100
Java           77
Python         54
C++            30
Name: language, dtype: int64

In [35]:
# Keep only the rows whose language was not relabelled as 'Other'

df = df.loc[df.language.ne('Other')]

In [37]:
# Split the data into train, validate, and test sets

train, validate, test = p.split_readme(df)
# The test split used to be bound to the misspelled name `text`;
# keep that name as an alias for any cell that still refers to it.
text = test

train.shape, validate.shape, test.shape

Train Proportion: 0.60 (156 rows)
Validate Proportion: 0.20 (52 rows)    
Test Proportion: 0.20 (53 rows)


((156, 3), (52, 3), (53, 3))

In [None]:
def show_counts_and_ratios(df, column):
    labels = pd.concat([df[column].value_counts(),
                       df[column].value_counts(normalize=True)], axis)