# Hey README, what language is your program? 

In [1]:
# data wrangling
import pandas as pd
import numpy as np
import json

#visuals
import matplotlib.pyplot as plt
import seaborn as sns

# modules
import acquire
import prepare

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Acquire

A list of 100 pages to scrape is generated using the `acquire.loop_through_urls` function. The list is saved to a CSV file, which is then read back in for further manipulation.

In [2]:
# Generate the list of pages to scrape and save it to CSV.
# Presumably left commented out so the scrape is not repeated on every run -- TODO confirm.
# NOTE(review): this writes "URL_list_100_final.csv", but the next cell reads
# "URL_list_100.csv" -- confirm which file name is intended.
# to_scrape = acquire.loop_through_urls()
# to_scrape = pd.DataFrame(to_scrape)
# to_scrape.to_csv("URL_list_100_final.csv")

In [3]:
# Read in the list of pages generated by acquire.loop_through_urls.
# header=0 combined with names= replaces the file's own header row with the label "page".
# NOTE(review): the result is only displayed, not assigned to a variable -- confirm that
# nothing downstream expects to reuse it.
pd.read_csv("URL_list_100.csv", header=0, names=["page"])

Unnamed: 0,page
0,http://github.com/freeCodeCamp/freeCodeCamp
1,http://github.com/996icu/996.ICU
2,http://github.com/vuejs/vue
3,http://github.com/facebook/react
4,http://github.com/tensorflow/tensorflow
...,...
95,http://github.com/ansible/ansible
96,http://github.com/kdn251/interviews
97,http://github.com/gatsbyjs/gatsby
98,http://github.com/opencv/opencv


The `acquire.make_corpus` function loops through the dataframe of 100 pages created above and, for each page, extracts its repository, language, and readme as a dictionary.

The `acquire.get_corpus` function checks whether the data file is in the cache. If it is, the file is read in as a dataframe; if it is not, `acquire.make_corpus` is run to generate it.

In [23]:
# Load the corpus as a dataframe. Per the markdown note above, get_corpus reads the
# cached data file when it exists and otherwise generates the data.
df_init = acquire.get_corpus()

In [24]:
# Display the raw corpus to check the repo, language and readme columns.
df_init

Unnamed: 0,repo,language,readme
0,freeCodeCamp/freeCodeCamp,JavaScript,\n\n\n\n\n\nfreeCodeCamp.org's open-source cod...
1,996icu/996.ICU,Rust,\n996.ICU\nPlease note that there exists NO ot...
2,vuejs/vue,JavaScript,\n\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue...
3,facebook/react,JavaScript,\nReact · \nReact is a JavaScript library f...
4,tensorflow/tensorflow,C++,\n\n\n\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\nT...
...,...,...,...
95,ansible/ansible,Python,\n\n \n \n \n \n \n\n\nAnsible\nAnsible is a ...
96,kdn251/interviews,Java,\nInterviews\n\nYour personal guide to Softwar...
97,gatsbyjs/gatsby,JavaScript,\n\n\n\n\n\n\n Gatsby v2\n\n\n⚛️ 📄 🚀\n\n\n F...
98,opencv/opencv,C++,\nOpenCV: Open Source Computer Vision Library\...


## Prepare

In [25]:
# Work on a copy so df_init keeps the corpus as originally loaded,
# while df is the frame used for prepping and exploring.
df = df_init.copy()

In [26]:
# Count repositories per language; dropna=False keeps missing languages as their own row.
df["language"].value_counts(dropna=False)

JavaScript          34
Python              14
None                12
Java                 8
C++                  6
TypeScript           6
Go                   4
C                    2
Vue                  2
Rust                 2
CSS                  1
HTML                 1
C#                   1
Dart                 1
Assembly             1
Jupyter Notebook     1
Ruby                 1
PHP                  1
Clojure              1
Shell                1
Name: language, dtype: int64

In [20]:
# Clean the readme text; the returned frame carries the original, stemmed,
# lemmatized and clean columns.
# Bind the result back to df explicitly so that later cells do not depend on
# prep_readme modifying its argument in place (or on leftover kernel state).
df = prepare.prep_readme(df)

# Show the prepared frame (same display the bare call produced).
df

Unnamed: 0,repo,language,original,stemmed,lemmatized,clean
0,freeCodeCamp/freeCodeCamp,JavaScript,\n\n\n\n\n\nfreeCodeCamp.org's open-source cod...,freecodecamporg' opensourc codebas and curricu...,freecodecamporg's opensource codebase and curr...,freecodecamporg's opensource codebase curricul...
1,996icu/996.ICU,Rust,\n996.ICU\nPlease note that there exists NO ot...,996icu pleas note that there exist no other of...,996icu please note that there exists no other ...,996icu please note exists official account app...
2,vuejs/vue,JavaScript,\n\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,facebook/react,JavaScript,\nReact · \nReact is a JavaScript library f...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
4,tensorflow/tensorflow,C++,\n\n\n\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\nT...,document tensorflow is an endtoend open sourc ...,documentation tensorflow is an endtoend open s...,documentation tensorflow endtoend open source ...
...,...,...,...,...,...,...
95,ansible/ansible,Python,\n\n \n \n \n \n \n\n\nAnsible\nAnsible is a ...,ansibl ansibl is a radic simpl it autom system...,ansible ansible is a radically simple it autom...,ansible ansible radically simple automation sy...
96,kdn251/interviews,Java,\nInterviews\n\nYour personal guide to Softwar...,interview your person guid to softwar engin te...,interview your personal guide to software engi...,interviews personal guide software engineering...
97,gatsbyjs/gatsby,JavaScript,\n\n\n\n\n\n\n Gatsby v2\n\n\n⚛️ 📄 🚀\n\n\n F...,gatsbi v2 fast in everi way that matter gatsbi...,gatsby v2 fast in every way that matter gatsby...,gatsby v2 fast every way matters gatsby free o...
98,opencv/opencv,C++,\nOpenCV: Open Source Computer Vision Library\...,opencv open sourc comput vision librari resour...,opencv open source computer vision library res...,opencv open source computer vision library res...


In [22]:
# Inspect the first 20 rows of df to check the prepared text columns.
df.head(20)

Unnamed: 0,repo,language,original,stemmed,lemmatized,clean
0,freeCodeCamp/freeCodeCamp,JavaScript,\n\n\n\n\n\nfreeCodeCamp.org's open-source cod...,freecodecamporg' opensourc codebas and curricu...,freecodecamporg's opensource codebase and curr...,freecodecamporg's opensource codebase curricul...
1,996icu/996.ICU,Rust,\n996.ICU\nPlease note that there exists NO ot...,996icu pleas note that there exist no other of...,996icu please note that there exists no other ...,996icu please note exists official account app...
2,vuejs/vue,JavaScript,\n\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,facebook/react,JavaScript,\nReact · \nReact is a JavaScript library f...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
4,tensorflow/tensorflow,C++,\n\n\n\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\nT...,document tensorflow is an endtoend open sourc ...,documentation tensorflow is an endtoend open s...,documentation tensorflow endtoend open source ...
5,twbs/bootstrap,JavaScript,"\n\n\n\n\n\nBootstrap\n\n Sleek, intuitive, a...",bootstrap sleek intuit and power frontend fram...,bootstrap sleek intuitive and powerful fronten...,bootstrap sleek intuitive powerful frontend fr...
6,EbookFoundation/free-programming-books,,\nThis page is available as an easy-to-read we...,thi page is avail as an easytoread websit at h...,this page is available a an easytoread website...,page available easytoread website httpsebookfo...
7,sindresorhus/awesome,,\n\n\n\n\n\n\n\nMy open source work is support...,my open sourc work is support by the commun sp...,my open source work is supported by the commun...,open source work supported community special t...
8,getify/You-Dont-Know-JS,,\nYou Don't Know JS Yet (book series) - 2nd Ed...,you don't know js yet book seri 2nd edit thi i...,you don't know j yet book series 2nd edition t...,know js yet book series 2nd edition series boo...
9,ohmyzsh/ohmyzsh,Shell,"\n\n\n\n\n\nOh My Zsh is an open source, commu...",oh my zsh is an open sourc communitydriven fra...,oh my zsh is an open source communitydriven fr...,oh zsh open source communitydriven framework m...
