# Spacy Cleaner

Proposed first model.

Since we would like to use other metadata for model improvement, SpaCy will not suffice.

When adding metadata, a PyTorch architecture will be built.

In [1]:
import pandas as pd
import json

In [2]:
data = pd.read_csv('data/labeled_data.csv', index_col = 0)
data = data[['readme_content', 'Purpose']]

In [3]:
data = data[data['readme_content'] != 'Missing']

In [4]:
data.head(3)

Unnamed: 0,readme_content,Purpose
0,"b'<h1 align=""left"">\n <br>\n <a href=""http:/...",Educational; Software Development; Miscellaneo...
1,"b""[![Python 3.6](https://img.shields.io/badge/...",Analysis
2,b'<!-- badges: start -->\n# [geoChronR](https:...,Software development


In [5]:
# Misc
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Informational link to database', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Article referencing database links', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Miscellaneous - Scraping database registries', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous; Miscellaneous - Article referencing database links', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Informational Link', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Scraping database registries', 'Miscellaneous')

data['Purpose'] = data['Purpose'].str.replace('Miscellaneous; Miscellaneous', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Articles referencing database links; Miscellaneous - Scraping open science publication databases', 'Miscellaneous')

data['Purpose'] = data['Purpose'].str.replace('Miscellaneous', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Scraping database registries', 'Miscellaneous')

data['Purpose'] = data['Purpose'].str.replace('Miscellaneous - Articles referencing database links', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous ', 'Miscellaneous')
data['Purpose'] = data['Purpose'].str.replace('Miscellaneous- Article with database linkts', 'Miscellaneous')



# Software Dev
data['Purpose'] = data['Purpose'].str.replace('Software development', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Software Development ', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Software Development ', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Storage; Software Development  ', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Software Development; Storage', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Storage; Software Development', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Software Development; Miscellanous - Scraping database registries', 'Software Development')
data['Purpose'] = data['Purpose'].str.replace('Software Development; Miscellaneous', 'Software Development')

# Analysis
data['Purpose'] = data['Purpose'].str.replace('Analysis; Storage', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis; Miscellaneous- Articles referencing databases', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis; Educational; Miscellaneous', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis; Educational', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis; Software Development', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis ', 'Analysis')
data['Purpose'] = data['Purpose'].str.replace('Analysis; Educational', 'Analysis')

# Educational
data['Purpose'] = data['Purpose'].str.replace('Educational, Software Development', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational; Miscellaneous', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational; Miscellaneous- Informational', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational; Software Development', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational ', 'Educational')
data['Purpose'] = data['Purpose'].str.replace('Educational- Informational', 'Educational')


In [6]:
data['Purpose'].unique()

array(['Educational', 'Analysis', 'Software Development', 'Miscellaneous',
       "Can't categorize/not enough information", 'Storage'], dtype=object)

In [7]:
data['Purpose'].value_counts().sort_values()

Storage                                     1
Can't categorize/not enough information     6
Educational                                14
Analysis                                   15
Software Development                       32
Miscellaneous                              43
Name: Purpose, dtype: int64

In [8]:
data = data[data['Purpose'] != 'Storage']
data = data[data['Purpose'] != "Can't categorize/not enough information"]
data = data.reset_index(drop = True)

In [9]:
data

Unnamed: 0,readme_content,Purpose
0,"b'<h1 align=""left"">\n <br>\n <a href=""http:/...",Educational
1,"b""[![Python 3.6](https://img.shields.io/badge/...",Analysis
2,b'<!-- badges: start -->\n# [geoChronR](https:...,Software Development
3,b'# README for scripts and data associated wit...,Analysis
4,b'# Integration Tests for DesignSafe\r\n### Th...,Software Development
...,...,...
99,b'# nlp_ranking\n\nRanking universities based ...,Miscellaneous
100,b'# Grond\n\nA probabilistic earthquake source...,Software Development
101,b'## stat133-fall-2015\n\nThis repository hold...,Educational
102,b'# Seismic Sound Downloader\n\nWebSite(En): h...,Software Development


In [10]:
data['Purpose'].value_counts()

Miscellaneous           43
Software Development    32
Analysis                15
Educational             14
Name: Purpose, dtype: int64

## Split Dataset

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = data['readme_content']
y = data['Purpose']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Create SpaCy Tokenizer

In [13]:
import spacy
import en_core_web_lg
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [14]:
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

# Defining a Custom Transformer

To further clean our text data, we’ll also want to create a custom transformer for removing initial and end spaces and converting text into lower case. Here, we will create a custom predictors class wich inherits the TransformerMixin class. This class overrides the transform, fit and get_parrams methods. We’ll also create a clean_text() function that removes spaces and converts text into lowercase.

In [15]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

# Vectorization Feature Engineering (BOW or TF-IDF)

In [16]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [17]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)
tfidf_vector

TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7fdb23408dd0>)

# Classifier Construction

# Training and Parameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:

class ClfSwitcher(BaseEstimator):
    def __init__(
        self, 
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [20]:
tree = DecisionTreeClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([('cleaner', predictors()),
                 ('vectorizer', tfidf_vector),
                 ('clf', ClfSwitcher())])

In [21]:
param_grid = [
              {
               'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
               #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
               #'tfidf__stop_words': ['english', None],
               'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
               'clf__estimator__max_iter': [50, 80],
               'clf__estimator__tol': [1e-4],
               'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
               },
              {
               'clf__estimator': [MultinomialNB()],
               #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
               #'tfidf__stop_words': [None],
               'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
               },
              {
               'clf__estimator': [DecisionTreeClassifier()],
               'clf__estimator__max_depth': list(range(1,20,2))}
               ]

In [22]:
#param_grid = {'classifier__max_depth': list(range(1,20,2))}

In [23]:
grid_search = GridSearchCV(pipe, param_grid, cv = 5, verbose=1, return_train_score=True)

In [24]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 31 candidates, totalling 155 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 155 out of 155 | elapsed:  1.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cleaner',
                                        <__main__.predictors object at 0x7fdb26b15090>),
                                       ('vectorizer',
                                        TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7fdb23408dd0>)),
                                       ('clf', ClfSwitcher())]),
             param_grid=[{'clf__estimator': [SGDClassifier(loss='modified_huber',
                                                           max_iter=80,
                                                           tol=0.0001)],
                          'clf__estimator__loss': ['hinge', 'log',
                                                   'modified_huber'],
                          'clf__estimator__max_iter': [50, 80],
                          'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
                          'clf__estimator__tol': [0.0001]},
                         {'clf__estimat

In [25]:
grid_search.best_score_

0.5676470588235294

In [26]:
#pd.DataFrame(grid_search).mean()

In [27]:
best_estimator = grid_search.best_estimator_

In [28]:
best_estimator.predict(X_test)

array(['Educational', 'Miscellaneous', 'Miscellaneous',
       'Software Development', 'Analysis', 'Miscellaneous',
       'Software Development', 'Miscellaneous', 'Miscellaneous',
       'Software Development', 'Miscellaneous', 'Miscellaneous',
       'Miscellaneous', 'Educational', 'Miscellaneous',
       'Software Development', 'Analysis', 'Miscellaneous',
       'Software Development', 'Software Development', 'Miscellaneous'],
      dtype='<U20')

In [29]:
from sklearn import metrics
# Predicting with a test dataset
predicted = best_estimator.predict(X_test)
train_predicted = best_estimator.predict(X_train)

# Model Accuracy
print("Best Estimator Test Accuracy:",metrics.accuracy_score(y_test, predicted))

Best Estimator Test Accuracy: 0.6666666666666666


In [30]:
print("Best Estimator Train Accuracy:",metrics.accuracy_score(y_train, train_predicted))

Best Estimator Train Accuracy: 1.0


Painfully overfitting....