In [None]:
import ast
import re
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from tqdm import tqdm

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Folder that holds both the raw and the pre-cleaned CSV exports.
DIR = './data/medium_articles/'

# True  -> read the raw CSV and run clean_text over every article.
# False -> read the already-cleaned CSV and skip the text cleaning step.
LOAD_RAW_DATASET = False

# Registers `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas(desc='')

In [None]:
# Choose the raw or the pre-cleaned export according to the config flag, then load it.
csv_name = 'medium_articles.csv' if LOAD_RAW_DATASET else 'medium_articles_cleaned.csv'
dataframe = pd.read_csv(DIR + csv_name)

In [None]:
# Shared text-normalisation helpers used by clean_text below.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Stored as a set so the per-word membership test in clean_text is a hash lookup.
eng_stopwords = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw article body into a space-separated string of stemmed tokens.

    Steps: lower-case, delete apostrophes, replace every non-letter character
    with a space, drop English stopwords, then lemmatise and stem each
    remaining word.

    Args:
        text: The raw article text (a ``str``).

    Returns:
        A single string with the processed tokens joined by single spaces.
    """
    text = text.lower()

    # Delete apostrophes outright so contractions collapse into one token,
    # then turn everything that is not a letter (digits, punctuation,
    # newlines, ...) into a space.
    text = re.sub('\'', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Stopwords must be filtered on the surface form, i.e. BEFORE stemming:
    # the stopword set holds unstemmed words, so a stopword whose stem differs
    # from its spelling would otherwise slip through the filter.
    # NOTE(review): any cached cleaned CSV produced with the previous ordering
    # (stem first, filter second) should be regenerated - confirm.
    words = [word for word in text.split() if word not in eng_stopwords]
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words]

    return ' '.join(words)

def process_tag(raw_tags: str):
    """Turn the stringified tag list stored in the CSV into a list of tag strings.

    The value is first parsed as a Python literal, which copes with tags that
    contain commas or apostrophes. If that fails (e.g. unquoted items), the
    original heuristic is used: remove apostrophes, strip the surrounding
    brackets and split on commas.

    Args:
        raw_tags: Normally a string such as ``"['AI', 'Data Science']"``.
            A list/tuple is accepted as-is (so re-running the cell on already
            parsed tags does not crash); any other non-string value (e.g. NaN)
            yields an empty list.

    Returns:
        A list of whitespace-stripped, non-empty tag strings.
    """
    if isinstance(raw_tags, (list, tuple)):
        parsed = raw_tags
    elif not isinstance(raw_tags, str):
        return []
    else:
        stripped = raw_tags.strip()
        try:
            parsed = ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            parsed = None
        if not isinstance(parsed, (list, tuple)):
            # Fallback: the original comma-splitting heuristic.
            parsed = stripped.replace('\'', '').strip('[]').split(',')

    # Drop empty entries so "[]" gives [] instead of a phantom '' tag.
    cleaned = (str(tag).strip() for tag in parsed)
    return [tag for tag in cleaned if tag]

def remove_rare_tags(tags: list, tags_list):
    """Return the entries of ``tags`` that also appear in ``tags_list``.

    Order and duplicates of ``tags`` are preserved; ``tags_list`` only needs
    to support the ``in`` operator.
    """
    kept = []
    for candidate in tags:
        if candidate in tags_list:
            kept.append(candidate)
    return kept

In [None]:
# Keep only the columns used downstream. Rebinding with errors='ignore'
# (instead of inplace=True) means this line does not raise a KeyError on
# already-dropped columns if the cell is executed again.
dataframe = dataframe.drop(columns=['title', 'timestamp', 'authors', 'category'], errors='ignore')

# Discard rows with no article text. `.copy()` makes the filtered frame
# independent of the original, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
dataframe = dataframe[dataframe['text'].notna()].copy()

# The text only needs cleaning when the raw export was loaded.
if LOAD_RAW_DATASET:
    dataframe['text'] = dataframe['text'].progress_apply(clean_text)

# Convert the stringified tag column into real Python lists.
dataframe['tags'] = dataframe['tags'].progress_apply(process_tag)
dataframe.head(5)

In [None]:
# A tag must occur strictly more often than this to be kept as a label.
tag_occurence_treshold = 200

# Count every tag occurrence across all articles (plain dict, first-seen order).
tags_dict = dict(Counter(tag for article_tags in dataframe['tags'] for tag in article_tags))

# Tags frequent enough to be used as labels.
tags = [tag for tag, count in tags_dict.items() if count > tag_occurence_treshold]

# Use a set for the per-row filter: membership checks become hash lookups
# instead of a linear scan of the `tags` list for every tag of every article.
frequent_tags = set(tags)
dataframe['tags'] = dataframe['tags'].progress_apply(lambda x: remove_rare_tags(x, frequent_tags))

In [None]:
# Fit the binarizer on the filtered tag lists (`fit` returns the estimator itself),
# then encode each article's tags as one row of the target matrix.
tag_lists = dataframe['tags']
mlb = MultiLabelBinarizer().fit(tag_lists)
y = mlb.transform(tag_lists)

In [None]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dataframe['text'],
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10_000,
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

In [None]:
# Sanity check: show the shapes of the training features and the training labels.
print(x_train_vectorized.shape, y_train.shape, sep='\n')

In [None]:
# One logistic-regression model per tag, wrapped in a one-vs-rest classifier.
log_reg = LogisticRegression(max_iter=3000)
classifier = OneVsRestClassifier(log_reg)
classifier.fit(x_train_vectorized, y_train)

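`max_iter=3000` is set well above scikit-learn's default of 100 iterations so that the `lbfgs` solver converges without raising a `ConvergenceWarning`; see <a href="https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter">this Stack Overflow discussion</a>.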

In [None]:
# With multilabel targets, `classifier.score` is *subset accuracy*: a sample
# only counts as correct when every one of its labels is predicted exactly.
# Report it under that name and add micro-averaged F1, which gives credit for
# partially correct tag sets.
for split_name, features, targets in (
    ('Train', x_train_vectorized, y_train),
    ('Test', x_test_vectorized, y_test),
):
    predictions = classifier.predict(features)
    subset_accuracy = accuracy_score(targets, predictions)
    # zero_division=0 avoids a warning if a split has no positive labels or predictions.
    micro_f1 = f1_score(targets, predictions, average='micro', zero_division=0)
    print(f'{split_name} subset accuracy: {subset_accuracy:.2%} | micro-F1: {micro_f1:.2%}')