In [19]:
import pandas as pd
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import pickle

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

In [21]:
# Folder holding the dataset CSVs and the cached artefacts used below.
DIR = './data/medium_articles/'

# True  -> reuse the cached cleaned text and the pickled classifier.
# False -> clean the raw text and train the classifier from scratch.
SPEED_RUN = True

# Registers `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas(desc='')

In [45]:
# Raw articles; the `tags` column holds each tag list as a string.
main_dataframe = pd.read_csv(DIR + 'medium_articles.csv')

# The cached, already-cleaned text is only loaded on the fast path.
if SPEED_RUN:
    text_dataframe = pd.read_csv(DIR + 'text_cleaned.csv')

main_dataframe.head(5)

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [23]:
def process_tag(raw_tags: str):
    """Parse the stringified tag list of one article into a list of tag names.

    The CSV stores each tag list as the text of a Python list, e.g.
    ``"['Mental Health', 'Health']"``. The value is parsed as a Python literal
    so that tags containing apostrophes (written with double quotes) or commas
    are kept intact instead of being mangled by a naive split.

    Args:
        raw_tags: The raw cell value. A list/tuple is accepted as well, which
            makes re-applying the function to already-parsed data harmless.
            Any other non-string value (e.g. NaN) is treated as "no tags".

    Returns:
        A list of whitespace-stripped, non-empty tag strings in their
        original order. ``'[]'`` yields ``[]``.

    NOTE(review): tags containing apostrophes or commas come out differently
    than with the previous split-based parsing. If ``classifier.pkl`` was
    trained on labels produced by the old parsing, regenerate it - TODO confirm.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(raw_tags, (list, tuple)):
        candidates = raw_tags
    elif not isinstance(raw_tags, str):
        return []
    else:
        try:
            candidates = ast.literal_eval(raw_tags.strip())
        except (ValueError, SyntaxError, TypeError):
            candidates = None
        if not isinstance(candidates, (list, tuple)):
            # Not a valid list literal: fall back to the legacy best-effort split.
            candidates = raw_tags.replace('\'', '').strip('[]').split(',')

    stripped = [str(tag).strip() for tag in candidates]
    # Empty entries (e.g. from '[]' in the fallback path) are not real tags.
    return [tag for tag in stripped if tag]

# Replace the stringified tag lists with parsed Python lists (tqdm shows progress).
main_dataframe['tags'] = main_dataframe['tags'].progress_apply(process_tag)

100%|██████████| 192368/192368 [00:01<00:00, 125288.28it/s]


In [24]:
# Holds the lazily-built NLP helpers so they are created once, not once per document.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (lemmatizer, stemmer, stop-word set) triple, building it on first use."""
    if 'resources' not in _CLEAN_TEXT_RESOURCES:
        # Build the whole triple before storing it so a failure (e.g. a missing
        # NLTK corpus) never leaves a half-initialised cache behind.
        _CLEAN_TEXT_RESOURCES['resources'] = (
            WordNetLemmatizer(),
            PorterStemmer(),
            frozenset(stopwords.words('english')),
        )
    return _CLEAN_TEXT_RESOURCES['resources']


def clean_text(text):
    """Normalise one raw article into a space-separated string of stemmed tokens.

    Steps: lowercase, drop apostrophes, replace every non-letter with a space,
    split on whitespace, lemmatize then stem each token, and finally drop
    tokens found in the English stop-word list.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The cleaned text as a single string; may be empty.

    The output is identical to the previous implementation; only the per-call
    construction of the lemmatizer, stemmer and stop-word list was hoisted,
    and a final split-on-newline/re-join that could never find a newline
    (tokens contain letters only) was removed.

    NOTE(review): stop words are removed after stemming and after apostrophes
    are stripped, so stop words whose processed form differs from the list
    entry are presumably kept. The order is left unchanged on purpose: the
    cached ``text_cleaned.csv`` / ``classifier.pkl`` appear to rely on this
    exact pipeline - confirm before changing it.
    """
    lemmatizer, stemmer, eng_stopwords = _get_clean_text_resources()

    text = text.lower()
    text = re.sub('\'', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()]
    return ' '.join(word for word in words if word not in eng_stopwords)

In [46]:
# Preview the cached, pre-cleaned text. `text_dataframe` is only loaded when
# SPEED_RUN is enabled, so skip the preview otherwise instead of raising NameError.
text_dataframe.head(5) if SPEED_RUN else None

Unnamed: 0,text
0,photo josh riemer unsplash merri christma happ...
1,brain coronaviru guid curiou troubl impact pan...
2,mind nose smell train chang brain six week whi...
3,passion synergi scienc technolog provid better...
4,heard phinea gage railroad worker surviv explo...


In [25]:
if SPEED_RUN:
    # Fast path: reuse the cached cleaned text instead of re-cleaning every article.
    x = text_dataframe['text']
    del text_dataframe
else:
    x = main_dataframe['text'].progress_apply(clean_text)

# Drop articles with no usable text. On the cached path such rows show up as
# NaN (presumably empty strings that were read back from the CSV as missing);
# on the freshly-cleaned path they are empty strings. Filtering both keeps the
# two paths consistent, so the later train/test split sees the same rows.
x = x[x.notna() & (x != '')]

In [44]:
# Show the first 100 characters of article 0 before and after cleaning,
# separated by a ruler line.
print(
    main_dataframe['text'][0][:100],
    '=' * 100,
    x[0][:100],
    sep='\n',
)

Photo by Josh Riemer on Unsplash

Merry Christmas and Happy Holidays, everyone!

We just wanted ever
photo josh riemer unsplash merri christma happi holiday everyon want everyon know much appreci every


In [26]:
# Labels: the tag lists of exactly those articles that still have text in `x`.
y = main_dataframe['tags'].loc[x.index]

# A tag must occur more often than this to be kept as a label.
tag_occurence_treshold = 200

# Count the occurrences of every tag across all remaining articles.
tags_dict = {}
for tag_list in y:
    for tag in tag_list:
        tags_dict[tag] = tags_dict.get(tag, 0) + 1

def remove_rare_tags(tags: list, tags_list):
    """Return the entries of ``tags`` that are also present in ``tags_list``.

    Order and duplicates of ``tags`` are preserved; ``tags_list`` is only used
    for membership tests, so any container supporting ``in`` works.
    """
    kept = []
    for candidate in tags:
        if candidate in tags_list:
            kept.append(candidate)
    return kept

# Tags occurring more than `tag_occurence_treshold` times form the label set.
tags = [key for key, value in tags_dict.items() if value > tag_occurence_treshold]

# Membership is tested once per tag of every article; a frozenset makes each
# test O(1) instead of scanning the whole `tags` list.
frequent_tags = frozenset(tags)
# The lambda parameter is deliberately not called `x`, to avoid shadowing the
# global feature Series of that name.
y = y.progress_apply(lambda article_tags: remove_rare_tags(article_tags, frequent_tags))

# Articles left without any frequent tag carry no label: drop them, then
# realign the features to the surviving rows.
y = y[y.str.len() > 0]

x = x.loc[y.index]

100%|██████████| 192367/192367 [00:05<00:00, 36255.94it/s]


In [47]:
# Number of distinct tags kept as labels after the frequency filter.
len(tags)

1215

In [27]:
# Sanity check: features and labels should have the same number of rows.
x.shape, y.shape

((186665,), (186665,))

In [28]:
# Turn each tag list into a binary indicator row (one column per tag).
# `mlb` is kept around so predictions can be mapped back to tag names.
mlb = MultiLabelBinarizer()
y = mlb.fit(y).transform(y)

# Hold out 20% of the articles for evaluation; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# TF-IDF features, with the vocabulary capped via `max_features` and very
# common terms excluded via `max_df`.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10_000,
)

# Fit on the training split only; the test split is merely transformed.
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

In [48]:
# (rows, features) of the vectorized train and test splits.
x_train_vectorized.shape, x_test_vectorized.shape

((149332, 10000), (37333, 10000))

In [None]:
if not SPEED_RUN:
    # Full run: train one logistic-regression model per tag, then cache the result.
    classifier = OneVsRestClassifier(LogisticRegression(max_iter=3000))
    classifier.fit(x_train_vectorized, y_train)

    with open(DIR + 'classifier.pkl', 'wb') as f:
        pickle.dump(classifier, f)
else:
    # Fast path: load the previously trained model.
    # SECURITY: unpickling executes arbitrary code - only load a file you created yourself.
    with open(DIR + 'classifier.pkl', 'rb') as f:
        classifier = pickle.load(f)

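Why <a href="https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter">max_iter=3000</a>: the iteration limit is raised to avoid the lbfgs ConvergenceWarning discussed in the linked Stack Overflow question.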

In [31]:
# `score` here is exact-match accuracy over the full tag set of an article
# (the test value equals the `accuracy_score` reported in the next cell),
# so low percentages are expected for a multi-label problem.
print(f'Train score: {classifier.score(x_train_vectorized, y_train): .2%}')
print(f'Test score: {classifier.score(x_test_vectorized, y_test): .2%}')

Train score:  3.52%
Test score:  2.84%


In [32]:
# Predicted indicator matrix for the held-out articles.
predictions = classifier.predict(x_test_vectorized)

# (micro-averaged F1, accuracy) on the test split.
(
    f1_score(y_test, predictions, average='micro'),
    accuracy_score(y_test, predictions),
)

(0.2521894755466033, 0.028393110652773685)

In [33]:
def predict(text):
    """Predict the tags of a single article.

    The text is cleaned with ``clean_text``, vectorized with the fitted
    ``tfidf_vectorizer``, classified, and the resulting indicator row is
    mapped back to tag names through ``mlb``.

    Args:
        text: Article text; it is passed through ``clean_text`` first.

    Returns:
        A list of predicted tag names (possibly empty).
    """
    features = tfidf_vectorizer.transform([clean_text(text)])
    label_rows = classifier.predict(features)
    # Exactly one document was passed in, so only the first row is relevant.
    first_row_tags = mlb.inverse_transform(label_rows)[0]
    return list(first_row_tags)

In [34]:
# Spot-check the model on up to 100 distinct test articles. Drawing the sample
# once with a fixed seed makes the cell reproducible and avoids showing the
# same article twice.
for k in x_test.sample(min(100, len(x_test)), random_state=42).index:
    # str() guards against a missing (NaN) title breaking the concatenation.
    print("Article: " + str(main_dataframe['title'][k]))
    # `predict` cleans its input itself, so it must receive the RAW article
    # text. `x_test` already holds cleaned text, and cleaning it a second time
    # can alter tokens because stemming is not idempotent.
    print("Predicted Tags: " + str(predict(main_dataframe['text'][k])))
    print("Actual Tags: " + str(main_dataframe['tags'][k]))
    print()

Article: The Best Remaining Veteran NBA Free Agents
Predicted Tags: ['Basketball', 'NBA']
Actual Tags: ['Basketball', 'Sports', 'Pop Culture', 'Culture', 'NBA']

Article: Which Line Would You Pull To Catch The Fish? Our Deep Learning Networks Don’t Know.
Predicted Tags: ['Artificial Intelligence', 'Deep Learning']
Actual Tags: ['Machine Learning', 'AI', 'Machine Vision', 'Deep Learning', 'Towards Data Science']

Article: Why You Should Write About Pain and Pleasure
Predicted Tags: []
Actual Tags: ['Life Lessons', 'Self', 'Advice', 'Society', 'Writing']

Article: Moving Freetrade into Figma
Predicted Tags: ['Figma']
Actual Tags: ['Figma', 'Product Design', 'Design Process', 'Design']

Article: The Intrepid Report Issue #37
Predicted Tags: ['Bitcoin', 'Blockchain', 'Cryptocurrency']
Actual Tags: ['Newsletter', 'Technology', 'News', 'Blockchain', 'Bitcoin']

Article: Interview with Sherry James, Author of “After The Suicide: Leading With Love And Light”
Predicted Tags: []
Actual Tags: ['S