In [47]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [1]:
import pandas as pd
from prepare_nlp import basic_clean, tokenize, lemmatize, remove_stopwords
from acquire import get_data

In [38]:
def basic_clean_keep_code(string):
    '''
    Takes in a string and returns a flattened, lowercased, ASCII-only copy with
    urls and special characters removed (code snippets are not treated specially).

    Steps, in order:
        1. line breaks become single spaces
        2. anything starting with http:// or https:// up to the next whitespace is dropped
        3. text is lowercased
        4. unicode is NFKD-normalized and characters without an ASCII form are discarded
        5. every character that is not a-z, 0-9, an apostrophe, or whitespace is removed
    Modules:
        import re
        import unicodedata
    '''
    # steps 1-3: flatten, strip urls, lowercase
    text = re.sub(r'https?://[^\s]+', '', re.sub(r'\n', ' ', string)).lower()

    # step 4: decompose accented characters, then keep only the ASCII bytes
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # step 5: whitelist of characters that survive
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [2]:
# load the README dataset through the project's acquire module (get_data is imported above)
df = get_data()
# preview the first rows; the columns shown are repo, language and readme_contents
df.head()

Unnamed: 0,repo,language,readme_contents
0,BeatSwitch/lock,PHP,# Lock - Acl for PHP 5.4+\n\n[![Build Status](...
1,Boostport/kubernetes-vault,Go,# No Longer Being Maintained.\n\nThe integrati...
2,Hitomis/SpinMenu,Java,# SpinMenu\n[![Android Arsenal](https://img.sh...
3,IBDecodable/IBLinter,Swift,# IBLinter\n[![Build Status](https://travis-ci...
4,KieranLafferty/KLNoteViewController,Objective-C,"KLNoteViewController\n=======\n\n<img src=""htt..."


In [12]:
df['readme_contents'].apply(basic_clean)

0      lock  acl for php 54\n\nbuild statushttpsimgs...
1      no longer being maintained\n\nthe integration...
2      spinmenu\nandroid arsenalhttpsimgshieldsiobad...
3      iblinter\nbuild statushttpstravisciorgibdecod...
4     klnoteviewcontroller\n\n\nimg srchttpsrawgithu...
                            ...                        
95     reduxaxiosmiddleware\n\nnpm versionhttpsbadge...
96     emacs starter kit\n\nversion 3 of the emacs s...
97    simulacrum\n\n\ncontinuous integrationhttpsgit...
98    httpolice\n\n\n status\n image httpsimgshields...
99    div aligncenter\nh1dstph1\n\ndstphttpsgithubco...
Name: readme_contents, Length: 100, dtype: object

In [14]:
# grab the first README as a sample string to experiment on
x = df.loc[0, 'readme_contents']

In [15]:
# examine the raw sample text and write out the cleaning steps it needs
x



#### Takeaways:
* replace all \n with ' '
* table of contents
* areas of interest: **___** for highlighted text, ## for headers, `highlighted text`, ```code block```, [topic](linktoinfoontopic)


#### Actions:
* Column 1:
    * First, remove all \n and replace with spaces
    * Remove all websites
    * Remove all special characters
* Column 2:
    * Remove all \n and replace with spaces
    * Remove all websites
    * Remove all code blocks
    * Remove all special characters

In [17]:
# beginning column 1 actions
# Remove line breaks: every newline in the sample becomes a single space
# NOTE: x is overwritten, so this cell works on whatever x currently holds
x = re.sub(r'\n', ' ', x)

In [28]:
# finding the websites: http:// or https:// followed by everything up to the next whitespace
regexp = r'https?://[^\s]+'
# subject is only referenced by the commented-out findall check below
subject = x
# re.findall(regexp, subject)

# replacing the websites with a single space (x is overwritten)
x = re.sub(regexp, ' ', x)

In [39]:
# run the cleaning function on the sample string and display the result
basic_clean_keep_code(x)



In [40]:
# testing the basic cleaning on every README
df['basic_clean_1'] = df['readme_contents'].apply(basic_clean_keep_code)

In [41]:
# getting tokenized form

In [43]:
def tokenize(string):
    '''
    Takes in a string and returns it tokenized, still as a single string
    (the tokenizer is called with return_str=True).
    Modules:
        import nltk (nltk.tokenize.ToktokTokenizer)
    '''
    # build the tokenizer and hand back its string-form output directly
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [44]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Takes in a string and returns it with the English stopwords removed.

    Parameters:
        string: whitespace-separated text to filter
        extra_words: optional word, or iterable of words, to treat as additional stopwords
        exclude_words: optional word, or iterable of words, to keep even if they are stopwords
    Returns:
        the surviving words joined by single spaces
    Modules:
         from nltk.corpus import stopwords
    '''
    def as_word_set(words):
        # None / empty means "nothing"; a bare string is one word, not a sequence of characters
        if not words:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    # a set makes the per-token membership check constant time instead of a list scan
    stop_words = set(stopwords.words('english'))

    # extras are added before exclusions are removed, so a word listed in both is kept
    stop_words |= as_word_set(extra_words)
    stop_words -= as_word_set(exclude_words)

    # keep every token that is not a stopword and rebuild the string
    return ' '.join(word for word in string.split() if word not in stop_words)

In [45]:
def cleaned_with_code_included(x):
    '''
    Takes in a string and runs the full cleaning pipeline on it:
    basic_clean_keep_code -> tokenize -> remove_stopwords (default stopword list).
    Returns the resulting string.
    '''
    # innermost call runs first: clean, then tokenize, then drop stopwords
    return remove_stopwords(tokenize(basic_clean_keep_code(x)))

In [48]:
# testing the full cleaning pipeline on every README
df['readme_contents'].apply(cleaned_with_code_included)

0     lock acl php 54 build status code climate test...
1     longer maintained integration vault kubernetes...
2     spinmenu android arsenal fragment preview img ...
3     iblinter build status swift 50 linter tool nor...
4     klnoteviewcontroller img src width50 control o...
                            ...                        
95    reduxaxiosmiddleware npm version redux middlew...
96    emacs starter kit version 3 emacs starter kit ...
97    simulacrum continuous integration maven centra...
98    httpolice status image target image target ima...
99    div aligncenter h1dstph1 dstp run common netwo...
Name: readme_contents, Length: 100, dtype: object

In [50]:
# dropping the initial test column
df = df.drop(columns='basic_clean_1')

In [51]:
# initial cleaning: store the cleaned, tokenized, stopword-free text
df['basic_clean_with_code'] = df['readme_contents'].apply(cleaned_with_code_included)

In [54]:
def lemmatize(string):
    '''
    Takes in a string and returns it with every whitespace-separated word
    replaced by its WordNet lemma, rejoined with single spaces.
    Modules:
        import nltk
    '''
    # one lemmatizer instance is reused for all words in the string
    wnl = nltk.stem.WordNetLemmatizer()

    # lemmatize word by word and stitch the results back together
    return ' '.join(map(wnl.lemmatize, string.split()))

In [58]:
df['basic_clean_with_code'].apply(lemmatize)

0     lock acl php 54 build status code climate test...
1     longer maintained integration vault kubernetes...
2     spinmenu android arsenal fragment preview img ...
3     iblinter build status swift 50 linter tool nor...
4     klnoteviewcontroller img src width50 control o...
                            ...                        
95    reduxaxiosmiddleware npm version redux middlew...
96    emacs starter kit version 3 emacs starter kit ...
97    simulacrum continuous integration maven centra...
98    httpolice status image target image target ima...
99    div aligncenter h1dstph1 dstp run common netwo...
Name: basic_clean_with_code, Length: 100, dtype: object

In [65]:
# practice data: the last 500 characters of the cleaned README in row 27
x = df.loc[27, 'basic_clean_with_code'][-500:]

In [66]:
# initializing the WordNet lemmatizer
wnl = nltk.stem.WordNetLemmatizer()

# show each practice word next to its lemma to see what the lemmatizer changes
for word in x.split():
    lemma = wnl.lemmatize(word)
    print('original:', word, '-- lemma:', lemma)

original: repoopenwhiskopenwhisk -- lemma: repoopenwhiskopenwhisk
original: microsoft -- lemma: microsoft
original: azure -- lemma: azure
original: functions -- lemma: function
original: progress -- lemma: progress
original: help -- lemma: help
original: wanted -- lemma: wanted
original: see -- lemma: see
original: azure -- lemma: azure
original: functions -- lemma: function
original: reference -- lemma: reference
original: example -- lemma: example
original: local -- lemma: local
original: testing -- lemma: testing
original: script -- lemma: script
original: bash -- lemma: bash
original: environment -- lemma: environment
original: variables -- lemma: variable
original: export -- lemma: export
original: azurestorageconnectionstring -- lemma: azurestorageconnectionstring
original: copy -- lemma: copy
original: azure -- lemma: azure
original: console -- lemma: console
original: export -- lemma: export
original: configfileconfigmsajson -- lemma: configfileconfigmsajson
original: node -- l

In [82]:
# one-time setup, left commented out so a re-run does not reinstall anything:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
import spacy

# Load English tokenizer, tagger, parser and NER
# (the en_core_web_sm model must already be downloaded, see the setup lines above)
nlp = spacy.load("en_core_web_sm")

In [83]:
# loaded spaCy pipelines keyed by model name, so each model is read from disk only once
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    '''Returns the spaCy pipeline for `name`, loading it only on the first request.'''
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def spacy_string(string):
    '''
    Takes in a string and returns it with every token replaced by its spaCy lemma,
    joined with single spaces.

    The 'en_core_web_sm' pipeline is loaded on the first call and reused afterwards;
    loading it on every call made applying this function row by row needlessly slow.
    Modules:
        import spacy
    '''
    # reuse the cached pipeline instead of calling spacy.load for every string
    pipeline = _get_spacy_pipeline()

    # lemma_ holds the lemmatized text of each token produced by the pipeline
    return ' '.join(token.lemma_ for token in pipeline(string))

In [85]:
# getting the spaCy-lemmatized text
df['spacy'] = df['basic_clean_with_code'].apply(spacy_string)

In [86]:
# getting the WordNet-lemmatized text
df['lem'] = df['basic_clean_with_code'].apply(lemmatize)

In [87]:
# checking that the basic_clean_with_code, spacy and lem columns were added
df.head()

Unnamed: 0,repo,language,readme_contents,basic_clean_with_code,spacy,lem
0,BeatSwitch/lock,PHP,# Lock - Acl for PHP 5.4+\n\n[![Build Status](...,lock acl php 54 build status code climate test...,lock acl php 54 build status code climate test...,lock acl php 54 build status code climate test...
1,Boostport/kubernetes-vault,Go,# No Longer Being Maintained.\n\nThe integrati...,longer maintained integration vault kubernetes...,long maintain integration vault kubernete grea...,longer maintained integration vault kubernetes...
2,Hitomis/SpinMenu,Java,# SpinMenu\n[![Android Arsenal](https://img.sh...,spinmenu android arsenal fragment preview img ...,spinmenu android arsenal fragment preview img ...,spinmenu android arsenal fragment preview img ...
3,IBDecodable/IBLinter,Swift,# IBLinter\n[![Build Status](https://travis-ci...,iblinter build status swift 50 linter tool nor...,iblinter build status swift 50 linter tool nor...,iblinter build status swift 50 linter tool nor...
4,KieranLafferty/KLNoteViewController,Objective-C,"KLNoteViewController\n=======\n\n<img src=""htt...",klnoteviewcontroller img src width50 control o...,klnoteviewcontroller img src width50 control o...,klnoteviewcontroller img src width50 control o...


In [None]:
# creating a basic_prepare function that does all of the above in one go


In [88]:
def basic_prepare(df):
    '''
    Takes in a df with a 'readme_contents' column, adds three text columns to it,
    and returns the same df:
        basic_clean_with_code: output of cleaned_with_code_included
        spacy: spaCy lemmas of basic_clean_with_code
        lem: WordNet lemmas of basic_clean_with_code
    The columns are assigned on the df that was passed in (no copy is made).
    '''
    # cleaned text is the input for both lemmatized columns
    df['basic_clean_with_code'] = df['readme_contents'].apply(cleaned_with_code_included)
    cleaned = df['basic_clean_with_code']

    # two lemmatized versions of the cleaned text
    df['spacy'] = cleaned.apply(spacy_string)
    df['lem'] = cleaned.apply(lemmatize)

    return df

In [89]:
# testing the above function: reload the data so basic_prepare starts from a frame without the added columns
df = get_data()

In [90]:
# success; next step is to create a prepare.py file holding these functions
basic_prepare(df)

Unnamed: 0,repo,language,readme_contents,basic_clean_with_code,spacy,lem
0,BeatSwitch/lock,PHP,# Lock - Acl for PHP 5.4+\n\n[![Build Status](...,lock acl php 54 build status code climate test...,lock acl php 54 build status code climate test...,lock acl php 54 build status code climate test...
1,Boostport/kubernetes-vault,Go,# No Longer Being Maintained.\n\nThe integrati...,longer maintained integration vault kubernetes...,long maintain integration vault kubernete grea...,longer maintained integration vault kubernetes...
2,Hitomis/SpinMenu,Java,# SpinMenu\n[![Android Arsenal](https://img.sh...,spinmenu android arsenal fragment preview img ...,spinmenu android arsenal fragment preview img ...,spinmenu android arsenal fragment preview img ...
3,IBDecodable/IBLinter,Swift,# IBLinter\n[![Build Status](https://travis-ci...,iblinter build status swift 50 linter tool nor...,iblinter build status swift 50 linter tool nor...,iblinter build status swift 50 linter tool nor...
4,KieranLafferty/KLNoteViewController,Objective-C,"KLNoteViewController\n=======\n\n<img src=""htt...",klnoteviewcontroller img src width50 control o...,klnoteviewcontroller img src width50 control o...,klnoteviewcontroller img src width50 control o...
...,...,...,...,...,...,...
95,svrcekmichal/redux-axios-middleware,JavaScript,# redux-axios-middleware\n\n[![npm version](ht...,reduxaxiosmiddleware npm version redux middlew...,reduxaxiosmiddleware npm version redux middlew...,reduxaxiosmiddleware npm version redux middlew...
96,technomancy/emacs-starter-kit,,# Emacs Starter Kit\n\nVersion 3 of the Emacs ...,emacs starter kit version 3 emacs starter kit ...,emacs starter kit version 3 emac starter kit i...,emacs starter kit version 3 emacs starter kit ...
97,typelevel/simulacrum,Scala,simulacrum\n==========\n\n[![Continuous Integr...,simulacrum continuous integration maven centra...,simulacrum continuous integration maven centra...,simulacrum continuous integration maven centra...
98,vfaronov/httpolice,Python,HTTPolice\n=========\n\n.. status:\n.. image::...,httpolice status image target image target ima...,httpolice status image target image target ima...,httpolice status image target image target ima...


In [None]:
# standard library
import re
import unicodedata  # was `import unicode`, which is not a module; basic_clean_keep_code calls unicodedata.normalize

# third party
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer


def basic_clean_keep_code(string):
    '''
    Takes in a string and returns a flattened, lowercased, ASCII-only copy with
    urls and special characters removed (code snippets are not treated specially).

    Steps, in order:
        1. line breaks become single spaces
        2. anything starting with http:// or https:// up to the next whitespace is dropped
        3. text is lowercased
        4. unicode is NFKD-normalized and characters without an ASCII form are discarded
        5. every character that is not a-z, 0-9, an apostrophe, or whitespace is removed
    Modules:
        import re
        import unicodedata
    '''
    # steps 1-3: flatten, strip urls, lowercase
    text = re.sub(r'https?://[^\s]+', '', re.sub(r'\n', ' ', string)).lower()

    # step 4: decompose accented characters, then keep only the ASCII bytes
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # step 5: whitelist of characters that survive
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

def tokenize(string):
    '''
    Takes in a string and returns it tokenized, still as a single string
    (the tokenizer is called with return_str=True).
    Modules:
        import nltk (nltk.tokenize.ToktokTokenizer)
    '''
    # build the tokenizer and hand back its string-form output directly
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)


def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Takes in a string and returns it with the English stopwords removed.

    Parameters:
        string: whitespace-separated text to filter
        extra_words: optional word, or iterable of words, to treat as additional stopwords
        exclude_words: optional word, or iterable of words, to keep even if they are stopwords
    Returns:
        the surviving words joined by single spaces
    Modules:
         from nltk.corpus import stopwords
    '''
    def as_word_set(words):
        # None / empty means "nothing"; a bare string is one word, not a sequence of characters
        if not words:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    # a set makes the per-token membership check constant time instead of a list scan
    stop_words = set(stopwords.words('english'))

    # extras are added before exclusions are removed, so a word listed in both is kept
    stop_words |= as_word_set(extra_words)
    stop_words -= as_word_set(exclude_words)

    # keep every token that is not a stopword and rebuild the string
    return ' '.join(word for word in string.split() if word not in stop_words)

def cleaned_with_code_included(x):
    '''
    Takes in a string and runs the full cleaning pipeline on it:
    basic_clean_keep_code -> tokenize -> remove_stopwords (default stopword list).
    Returns the resulting string.
    '''
    # innermost call runs first: clean, then tokenize, then drop stopwords
    return remove_stopwords(tokenize(basic_clean_keep_code(x)))

def lemmatize(string):
    '''
    Takes in a string and returns it with every whitespace-separated word
    replaced by its WordNet lemma, rejoined with single spaces.
    Modules:
        import nltk
    '''
    # one lemmatizer instance is reused for all words in the string
    wnl = nltk.stem.WordNetLemmatizer()

    # lemmatize word by word and stitch the results back together
    return ' '.join(map(wnl.lemmatize, string.split()))

# loaded spaCy pipelines keyed by model name, so each model is read from disk only once
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    '''Returns the spaCy pipeline for `name`, loading it only on the first request.'''
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def spacy_string(string):
    '''
    Takes in a string and returns it with every token replaced by its spaCy lemma,
    joined with single spaces.

    The 'en_core_web_sm' pipeline is loaded on the first call and reused afterwards;
    loading it on every call made applying this function row by row needlessly slow.
    Modules:
        import spacy
    '''
    # reuse the cached pipeline instead of calling spacy.load for every string
    pipeline = _get_spacy_pipeline()

    # lemma_ holds the lemmatized text of each token produced by the pipeline
    return ' '.join(token.lemma_ for token in pipeline(string))

def basic_prepare(df):
    '''
    Takes in a df with a 'readme_contents' column, adds three text columns to it,
    and returns the same df:
        basic_clean_with_code: output of cleaned_with_code_included
        spacy: spaCy lemmas of basic_clean_with_code
        lem: WordNet lemmas of basic_clean_with_code
    The columns are assigned on the df that was passed in (no copy is made).
    '''
    # cleaned text is the input for both lemmatized columns
    df['basic_clean_with_code'] = df['readme_contents'].apply(cleaned_with_code_included)
    cleaned = df['basic_clean_with_code']

    # two lemmatized versions of the cleaned text
    df['spacy'] = cleaned.apply(spacy_string)
    df['lem'] = cleaned.apply(lemmatize)

    return df

In [1]:
from acquire import get_data
from prepare import basic_prepare

# get data
df = get_data()

# get prepared data (basic_prepare adds the basic_clean_with_code, spacy and lem columns)
df = basic_prepare(df)

# check the head
df.head()