In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import pickle

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# When True, the cells below reuse cached artifacts (the pre-cleaned text CSV and the
# pickled classifier) instead of recomputing them.
SPEED_RUN = True
# Registers the `progress_apply` methods on pandas objects; the progress bar gets no label.
tqdm.pandas(desc='')
# Path prefix (note the trailing slash) that data and cache file names are appended to.
DIR = './data/medium_articles/'

In [4]:
# The raw articles are always loaded: titles, tags and raw text are read from this frame.
main_dataframe = pd.read_csv(
    DIR + 'medium_articles.csv'
)

# The cached text is only read when the slow cleaning step is going to be skipped.
if SPEED_RUN:
    text_dataframe = pd.read_csv(
        DIR + 'text_cleaned.csv'
    )

main_dataframe.head(2)

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."


In [5]:
def process_tag(raw_tags: str):
    """Parse the string form of a tag list into a list of tag strings.

    The CSV stores tags as the text of a Python list, e.g.
    ``"['Mental Health', 'Health']"``. The text is parsed as a Python literal so
    that tags containing apostrophes (written with double quotes, such as
    ``"Women's Health"``) or commas keep their exact spelling.

    Args:
        raw_tags: Text of a list of tags. An already parsed list/tuple is
            accepted and returned as a new list, so re-running the cell on a
            converted column does not fail. Any other non-string value (for
            example NaN for a missing cell) yields an empty list.

    Returns:
        list[str]: The tags with surrounding whitespace removed. ``"[]"`` gives ``[]``.

    Note:
        Tags with apostrophes or commas are parsed differently from the earlier
        quote-stripping approach; artifacts that were built from the old labels
        (such as a cached classifier) should be regenerated.
    """
    import ast  # local import: keeps this cell runnable on its own

    # Already parsed (cell re-run): hand back a copy instead of failing on .replace().
    if isinstance(raw_tags, (list, tuple)):
        return list(raw_tags)
    # Missing value (NaN / None): no tags.
    if not isinstance(raw_tags, str):
        return []

    try:
        parsed = ast.literal_eval(raw_tags.strip())
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(tag).strip() for tag in parsed]

    # Not a valid Python literal: fall back to the plain comma split, skipping empty pieces.
    pieces = raw_tags.replace('\'', '').strip('[]').split(',')
    return [piece.strip() for piece in pieces if piece.strip()]

# Overwrite the 'tags' column with process_tag applied to every row;
# progress_apply (registered by tqdm.pandas) shows a progress bar while it runs.
main_dataframe['tags'] = main_dataframe['tags'].progress_apply(process_tag)

100%|██████████| 192368/192368 [00:00<00:00, 438545.64it/s]


In [6]:
# Lazily created (lemmatizer, stemmer, stop-word set) shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = None


def _clean_text_resources():
    """Build the lemmatizer, stemmer and English stop-word set once and reuse them."""
    global _CLEAN_TEXT_RESOURCES
    if _CLEAN_TEXT_RESOURCES is None:
        _CLEAN_TEXT_RESOURCES = (
            WordNetLemmatizer(),
            PorterStemmer(),
            frozenset(stopwords.words('english')),
        )
    return _CLEAN_TEXT_RESOURCES


def clean_text(text):
    """Normalise an article into a space-separated string of stemmed tokens.

    Steps: lowercase, delete apostrophes, turn every other non-letter character
    into a space, split on whitespace, lemmatize then stem each token, and drop
    tokens that are English stop words.

    Args:
        text: The article text as a ``str``.

    Returns:
        str: The remaining tokens joined by single spaces ('' when none remain).
    """
    # Building these objects per call re-reads the stop-word corpus for every
    # article; they are created once and shared instead.
    lemmatizer, stemmer, eng_stopwords = _clean_text_resources()

    text = text.lower()
    text = re.sub('\'', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()]
    # NOTE(review): stop words are removed after stemming, so a stop word whose
    # stem differs from the word itself (e.g. 'this' -> 'thi') is not filtered.
    # The order is kept so the output stays identical to previously cleaned text.
    words = [word for word in words if word not in eng_stopwords]

    # Tokens contain no whitespace, so a single join is all that is needed.
    return ' '.join(words)

In [7]:
if SPEED_RUN:
    # Reuse the cached cleaned text. `text_dataframe` is intentionally not deleted:
    # deleting it made this cell raise NameError when it was run a second time.
    x = text_dataframe['text']
else:
    # Rows without text are skipped first; clean_text calls .lower() and would
    # fail on a missing (NaN) value.
    x = main_dataframe['text'].dropna().progress_apply(clean_text)

# Keep only documents that still have text. Missing values are dropped as before;
# empty strings are dropped as well so that freshly cleaned text is filtered the
# same way as cached text (presumably an empty string comes back from the CSV as
# NaN -- confirm against how text_cleaned.csv was written).
x = x[x.notna() & (x.str.len() > 0)]

In [8]:
# Tag lists for exactly the articles that still have text.
y = main_dataframe['tags'].loc[x.index]

# A tag must occur more than this many times to be kept as a label.
tag_occurence_treshold = 200

# tag -> number of occurrences across all remaining articles.
tags_dict = {}
for tag_list in y:
    for tag in tag_list:
        tags_dict[tag] = tags_dict.get(tag, 0) + 1

def remove_rare_tags(tags: list, tags_list):
    """Return the entries of ``tags`` that are contained in ``tags_list``.

    Order and duplicates of ``tags`` are preserved; ``tags`` itself is not modified.

    Args:
        tags: Tags of a single article.
        tags_list: Container of tags to keep (anything supporting ``in``).

    Returns:
        list: The kept tags.
    """
    kept = []
    for candidate in tags:
        if candidate in tags_list:
            kept.append(candidate)
    return kept

# Tags that occur more than `tag_occurence_treshold` times.
tags = [key for key, value in tags_dict.items() if value > tag_occurence_treshold]

# Membership is tested once per tag of every article; a frozenset makes each test
# constant-time instead of a scan through the whole list.
frequent_tags = frozenset(tags)
y = y.progress_apply(lambda tag_list: remove_rare_tags(tag_list, frequent_tags))

# Drop articles that are left without any tag, and keep x aligned with y.
y = y[y.str.len() > 0]

x = x.loc[y.index]

100%|██████████| 192367/192367 [00:04<00:00, 39593.57it/s]


In [9]:
# Sanity check: x was re-indexed with y's index, so both must report the same number of rows.
x.shape, y.shape

((186665,), (186665,))

In [10]:
# Learn the set of tags and encode each article's tag list in a single step.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

# Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# max_df=0.8 ignores terms found in more than 80% of the documents;
# max_features=10_000 caps the size of the vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10_000)
# The vectorizer is fitted on the training split only and then applied unchanged to the test split.
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

In [None]:
# SECURITY: pickle.load can execute arbitrary code stored in the file. Only load a
# classifier.pkl that was written by this notebook; never one from an untrusted source.
classifier = None
if SPEED_RUN:
    try:
        with open(DIR + 'classifier.pkl', 'rb') as f:
            classifier = pickle.load(f)
    except FileNotFoundError:
        # No cache yet (e.g. first run): fall through and train instead of failing.
        print('No cached classifier found; training a new one.')

if classifier is None:
    # One independent logistic regression per tag.
    classifier = OneVsRestClassifier(LogisticRegression(max_iter=3000))
    classifier.fit(x_train_vectorized, y_train)

    # Cache the fitted model so later runs with SPEED_RUN can skip training.
    with open(DIR + 'classifier.pkl', 'wb') as f:
        pickle.dump(classifier, f)

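Why <a href="https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter">max_iter=3000</a>: the iteration limit is raised well above scikit-learn's default of 100 so that the lbfgs solver used by LogisticRegression can converge; the linked answer explains the ConvergenceWarning that appears when the limit is too low.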

In [13]:
# For this multilabel setup the score is exact-match accuracy: an article only counts as
# correct when its whole predicted tag set equals the true one, which is why the values are low.
print(f'Train score: {classifier.score(x_train_vectorized, y_train): .2%}')
print(f'Test score: {classifier.score(x_test_vectorized, y_test): .2%}')

Train score:  3.52%
Test score:  2.84%


In [14]:
predictions = classifier.predict(x_test_vectorized)
# Displayed as (micro-averaged F1, accuracy). Micro averaging pools the per-tag decisions of
# all articles, so partially correct tag sets still earn credit, unlike accuracy_score.
f1_score(y_test, predictions, average='micro'), accuracy_score(y_test, predictions)

(0.2521894755466033, 0.028393110652773685)

In [29]:
def predict(text):
    """Predict the tags of a single article.

    The text is passed through clean_text, vectorized with the fitted
    tfidf_vectorizer and classified; the predicted indicator row is mapped back
    to tag names with mlb.

    Args:
        text: Article text as a string.

    Returns:
        list: Predicted tag names (empty when no tag is predicted).
    """
    cleaned = clean_text(text)
    features = tfidf_vectorizer.transform([cleaned])
    label_rows = mlb.inverse_transform(classifier.predict(features))
    return list(label_rows[0])

In [31]:
# Show predicted next to actual tags for up to 100 test articles. The sample is drawn
# once with a fixed seed, so no article appears twice and re-runs show the same articles.
demo_indices = x_test.sample(min(100, len(x_test)), random_state=42).index
for k in demo_indices:
    # predict() runs clean_text itself, so it has to receive the raw article text.
    # x_test holds text that is already cleaned; passing it would stem it a second time.
    raw_text = main_dataframe.loc[k, 'text']
    # str() keeps the concatenation from failing when a title is missing (NaN).
    print("\033[1m" + "Article: " + "\033[0m" + str(main_dataframe.loc[k, 'title']))
    print("\033[1m" + "Predicted Tags: " + "\033[0m" + str(predict(raw_text)))
    print("\033[1m" + "Actual Tags: " + "\033[0m" + str(main_dataframe.loc[k, 'tags']))
    print()

[1mArticle: [0m+40.16% growth: How to Buy Insights Network (INSTAR) — A Step by Step Guide
[1mPredicted Tags: [0m['Bitcoin', 'Crypto', 'Cryptocurrency']
[1mActual Tags: [0m['Instar', 'Crypto', 'Cryptocurrencies', 'Insights', 'Network']

[1mArticle: [0mHow to share an open job on LinkedIn
[1mPredicted Tags: [0m['LinkedIn']
[1mActual Tags: [0m['Hiring', 'Recruiting', 'Employer Branding', 'Startup', 'Recruitment Marketing']

[1mArticle: [0mHorrible Missionaries Listen To Rufus Wainwright
[1mPredicted Tags: [0m[]
[1mActual Tags: [0m['Missionary Life', 'Kenya', 'Life Lessons', 'Rufus Wainwright', 'Evangelism']

[1mArticle: [0mLord Dubs
[1mPredicted Tags: [0m['Immigration']
[1mActual Tags: [0m['Children', 'Border', 'Refugees', 'Migration', 'Immigration']

[1mArticle: [0mHello Ognitio!
[1mPredicted Tags: [0m[]
[1mActual Tags: [0m['Data Science', 'Cloud Computing', 'Enterprise Software']

[1mArticle: [0mHOW TO GET SALES FOR CBD
[1mPredicted Tags: [0m['Cannabis