In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

%load_ext autoreload
%autoreload 2

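References:

- `TfidfVectorizer` API: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
- scikit-learn "Working with text data" tutorial: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

Open question: would applying dimensionality reduction (e.g. PCA) on top of the TF-IDF features help?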

In [None]:
from crawler import async_cache_pages, urls_list
import random
random.seed(0)

In [None]:
# Folder(s) of labelled URLs used for this first experiment.
folders = ["general2"]

# urls_list yields two collections, unpacked here as positive and negative URLs.
positives, negatives = urls_list(folders)
# Cache every page before feature extraction.
# NOTE(review): presumably downloads pages asynchronously into a local cache -- confirm in crawler.
async_cache_pages(positives + negatives)

In [None]:
# Balance the classes: keep every positive URL and draw an equally sized
# random sample (without replacement) from the negatives.
n_per_class = len(positives)
sampled_negatives = random.sample(negatives, n_per_class)
urls = positives + sampled_negatives
# Labels follow the same order as `urls`: positives first, then negatives.
labels = [True] * n_per_class + [False] * n_per_class

In [None]:
from features import construct_text_df

In [None]:
# Build the DataFrame for the selected URLs; later cells read its
# "visible_text" and "label" columns.
df = construct_text_df(urls, labels)

In [None]:
df.sample(5, random_state=1)

In [None]:
# Model input is the text column; the target is the label column.
X = df["visible_text"]
y = df["label"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Stand-alone vectorizer/transformer (default settings) used for the manual
# TF-IDF inspection below; the classification pipelines create their own instances.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [None]:
# Learn the vocabulary from the page texts and build the document-term count matrix.
X_counts = count_vect.fit_transform(X)

In [None]:
# Re-weight the raw counts into TF-IDF scores (IDF is fitted on this same matrix).
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Parameters
    ----------
    row : 1-D array-like of numbers
        Scores of one document; ``row[i]`` is the score of ``features[i]``.
    features : sequence indexable by integer position
        Feature names aligned with ``row``.
    top_n : int, default 25
        Maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``tfidf``, ordered by decreasing score, with at
        most ``top_n`` rows. The frame is empty (but still has both columns)
        when nothing is selected.
    '''
    # argsort is ascending, so reverse it before keeping the first top_n indices.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning `.columns` afterwards
    # raises a length-mismatch ValueError when `top_feats` is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row)

    Parameters
    ----------
    Xtr : matrix whose rows support ``.toarray()`` (e.g. a scipy sparse matrix)
        One row per document, one column per feature.
    features : sequence of feature names aligned with the columns of ``Xtr``.
    row_id : int
        Index of the document (row) to inspect.
    top_n : int, default 25
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame as produced by ``top_tfidf_feats``.
    '''
    # ravel() always yields a 1-D vector. np.squeeze would collapse a row with a
    # single column to a 0-d array, which cannot be indexed by position.
    row = Xtr[row_id].toarray().ravel()
    return top_tfidf_feats(row, features, top_n)

In [None]:
# Vocabulary learnt by count_vect, in column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(count_vect, "get_feature_names_out"):
    feats = count_vect.get_feature_names_out()
else:
    feats = count_vect.get_feature_names()
# Peek at an arbitrary slice of the vocabulary (empty if it has fewer than 10000 terms).
feats[10000:10000 + 10]

In [None]:
top_feats_in_doc(X_tfidf, feats, 0, 10)

In [None]:
top_feats_in_doc(X_tfidf, feats, 1, 10)

In [None]:
top_feats_in_doc(X_tfidf, feats, 2, 10)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Token counts -> TF-IDF re-weighting -> multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Train on the full X / y; as the last expression of the cell, the fitted
# pipeline is also displayed.
pipeline.fit(X, y)

In [None]:
# A few short hand-written phrases used as a quick sanity check of the fitted pipeline.
docs_new = ["course submit", "education learning", "python coursera", "submit button", "medical hospital"]
# One predicted label per phrase, in the same order as docs_new.
predicted = pipeline.predict(docs_new)

In [None]:
# Show every probe phrase next to the label predicted for it.
for idx, doc in enumerate(docs_new):
    print(doc, "==>", predicted[idx])

In [None]:
from features import analyse_classification

In [None]:
# Fresh, unfitted copy of the naive Bayes pipeline, handed to the project's
# evaluation helper together with the data.
pipeline2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

analyse_classification(X, y, pipeline2)

## Taking all training data (more negatives than positives)

```
             precision    recall  f1-score   support

      False       0.78      0.98      0.87        50
       True       0.92      0.44      0.59        25

avg / total       0.82      0.80      0.78        75

[[49  1]
 [14 11]]
```

In [None]:
from sklearn.linear_model import LogisticRegression

# Same text features as before, with logistic regression as the classifier.
pipeline3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

analyse_classification(X, y, pipeline3)

In [None]:
pipeline3.decision_function(X)

Possible problem: a score of 0.84 seems quite high. Is the model overfitting because the dataset contains 6-7 pages from the same platform?

In [None]:
# Extra pages listed by hand (not loaded through urls_list) that are expected
# to be classified as True.
new_urls_true = [
    "https://www.wallstreetprep.com/self-study-programs/oil-and-gas-modeling/",
    "https://www.wallstreetprep.com/self-study-programs/restructuring-modeling/",
    "https://www.wallstreetprep.com/self-study-programs/bank-and-fig-modeling/",
    "https://www.wallstreetprep.com/self-study-programs/adv-lbo-modeling/",
    "https://www.educba.com/course/online-investment-banking-training-courses/",
    "https://www.creativelive.com/courses/find-your-niche-and-build-your-family-photography-business-julia-kelleher",
    "https://www.creativelive.com/courses/lightroom-cc-photo-editing-the-complete-guide-ben-willmore",
    "https://www.creativelive.com/courses/portrait-compositing-from-start-to-finish-matt-kloskowski?via=class-list-collection_3",
    "https://onlinecoursemasters.com/",
    "https://ocw.mit.edu/courses/mathematics/18-01-single-variable-calculus-fall-2006/index.htm",
    "http://tutorial.math.lamar.edu/Classes/CalcI/CalcI.aspx",
    "https://www.khanacademy.org/math/calculus-home",
    "https://www.coursera.org/learn/calculus1",
    "https://www.simplilearn.com/blockchain-certification-training",
]

# Extra pages listed by hand that are expected to be classified as False.
new_urls_false = [
    "https://twitter.com/?lang=en",
    "http://iamafoodblog.com/",
    "http://iamafoodblog.com/category/recipes/",
    "https://keepvid.com/sites/download-youtube-video.html",
    "https://en.wikipedia.org/wiki/Massive_open_online_course",
    "https://github.com/",
    "https://github.com/features",
    "https://news.ycombinator.com/",
    "https://www.schneems.com/2017/11/14/wtf-is-a-source-map/",
    "https://medium.freecodecamp.org/using-svg-as-placeholders-more-image-loading-techniques-bed1b810ab2c"
]


# Cache all the new pages before building their DataFrames.
async_cache_pages(new_urls_true + new_urls_false)

In [None]:
# The labels are only passed because construct_text_df takes them as its second
# argument; the cells below use just the "visible_text" column of each frame.
new_labels_true = [True] * len(new_urls_true)
new_labels_false = [False] * len(new_urls_false)

# Separate frames so expected positives and expected negatives can be inspected independently.
df_true = construct_text_df(new_urls_true, new_labels_true)
df_false = construct_text_df(new_urls_false, new_labels_false)

X_new_true = df_true["visible_text"]
X_new_false = df_false["visible_text"]

In [None]:
pipeline3.predict(X_new_true)

In [None]:
pipeline3.decision_function(X_new_true)

In [None]:
pipeline3.predict(X_new_false)

In [None]:
pipeline3.decision_function(X_new_false)

Yes: it works well because of the duplication in the training set. Still, on these new pages there are no false positives, and the false negatives have decision-function values close to zero, meaning the model is "unsure" about them.

In [None]:
# Switch to the "general3" folder. This rebinds `folders`, `positives` and
# `negatives`, so the general2 values from the earlier cells are no longer available.
folders = ["general3"]

positives, negatives = urls_list(folders)
# Cache every page before feature extraction.
async_cache_pages(positives + negatives)

In [None]:
# Balance the classes: keep every positive URL and draw an equally sized
# random sample (without replacement) from the negatives.
n_per_class = len(positives)
sampled_negatives = random.sample(negatives, n_per_class)
urls = positives + sampled_negatives
# Labels follow the same order as `urls`: positives first, then negatives.
labels = [True] * n_per_class + [False] * n_per_class

In [None]:
# Rebuild the frame for the new URL selection, then re-derive the model input
# (text column) and target (label column); this rebinds df, X and y.
df = construct_text_df(urls, labels)
X = df["visible_text"]
y = df["label"]

In [None]:
# Multinomial NB (pipeline2), evaluated on the data currently bound to X / y.
# NOTE(review): pipeline2 was already passed to analyse_classification earlier
# with the previous X / y -- confirm the helper refits (or clones) the pipeline
# rather than relying on previously fitted state.
analyse_classification(X, y, pipeline2)

In [None]:
# Logistic regression (pipeline3), evaluated on the data currently bound to X / y.
# NOTE(review): pipeline3 was already passed to analyse_classification earlier
# with the previous X / y -- confirm the helper refits (or clones) the pipeline
# rather than relying on previously fitted state.
analyse_classification(X, y, pipeline3)

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. joblib is a scikit-learn dependency, so import it directly and keep the
# old location only as a fallback for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import os

In [None]:
# Persist pipeline3 to disk.
# NOTE(review): the saved state is whatever analyse_classification left in
# pipeline3 -- confirm it was fitted on the intended data before relying on the file.
model_dir = os.path.join('saved', 'models')
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline3, os.path.join(model_dir, 'log_reg_pipeline_general3.pkl'))