In [27]:
import ast
import re
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from tqdm import tqdm

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [29]:
# Folder that contains the article CSV files read by the loading cell.
DIR = './data/medium_articles/'
# True  -> load 'medium_articles.csv' and run `clean_text` over the text column.
# False -> load 'medium_articles_cleaned.csv' and skip the cleaning pass
#          (presumably that file already holds cleaned text - confirm).
LOAD_RAW_DATASET = False
# Register `progress_apply` on pandas objects; the bar gets an empty description.
tqdm.pandas(desc='')

In [30]:
# Choose the CSV that matches the LOAD_RAW_DATASET switch, then read it once.
csv_name = 'medium_articles.csv' if LOAD_RAW_DATASET else 'medium_articles_cleaned.csv'
dataframe = pd.read_csv(DIR + csv_name)

In [31]:
# Module-level NLP helpers, created once and reused by `clean_text` on every call.
eng_stopwords = set(stopwords.words('english'))  # a set, so `in` checks are hash lookups
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise raw article text into a space-separated string of stems.

    Processing steps:
      1. lowercase the text;
      2. delete apostrophes, then replace every remaining non-letter
         character with a space;
      3. split on whitespace and discard English stopwords;
      4. lemmatize, then stem, each remaining token;
      5. discard tokens whose stem is itself a stopword.

    Stopwords are filtered *before* stemming because the stopword list
    holds surface forms: once a token has been stemmed (the Porter stem of
    'this' is 'thi') it no longer matches the list and would otherwise
    leak into the output. The post-stem filter of step 5 is kept so that
    every token the earlier implementation removed is still removed.

    Args:
        text: the raw text of one article (a `str`).

    Returns:
        A single string of processed tokens joined by one space; the empty
        string when no token survives.
    """
    text = text.lower()

    text = re.sub('\'', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    stems = []
    for word in text.split():
        # Compare the surface form first, while it can still match the list.
        if word in eng_stopwords:
            continue
        stem = stemmer.stem(lemmatizer.lemmatize(word))
        if stem not in eng_stopwords:
            stems.append(stem)

    # Tokens come from str.split(), so they hold no whitespace and a plain
    # join is all that is needed.
    return ' '.join(stems)

def process_tag(raw_tags: str):
    """Parse the string form of a tag list into a list of tag strings.

    The input is expected to look like the repr of a Python list, e.g.
    "['Mental Health', 'Health']". It is parsed with `ast.literal_eval`,
    so tags that contain a comma or an apostrophe stay intact and
    double-quoted items lose their quotes. Input that is not a valid
    list/tuple literal falls back to the simple approach of deleting
    apostrophes, stripping the surrounding brackets and splitting on commas.

    Args:
        raw_tags: string representation of the tag list.

    Returns:
        List of tags with surrounding whitespace removed. Empty tags are
        dropped, so "[]" yields [] rather than [''].
    """
    try:
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. a bare comma-separated string).
        parsed = None

    if isinstance(parsed, (list, tuple)):
        candidates = [str(tag) for tag in parsed]
    else:
        candidates = raw_tags.replace('\'', '').strip('[]').split(',')

    stripped = (candidate.strip() for candidate in candidates)
    return [tag for tag in stripped if tag]

def remove_rare_tags(tags: list, tags_list):
    """Return the entries of `tags` that also occur in `tags_list`.

    Args:
        tags: tags attached to a single article.
        tags_list: container of the tags to keep; only `in` is used on it.

    Returns:
        A new list with the kept tags in their original order.
    """
    kept = []
    for candidate in tags:
        if candidate in tags_list:
            kept.append(candidate)
    return kept

In [32]:
# Drop the metadata columns that are not used below and discard rows without text.
# The steps are non-mutating and end in an explicit copy, so `dataframe` is a
# fresh, independent frame: the column assignments that follow do not write
# into a filtered slice of the loaded frame (the SettingWithCopyWarning pattern).
dataframe = (
    dataframe
    .drop(columns=['title', 'timestamp', 'authors', 'category'])
    .dropna(subset=['text'])
    .copy()
)
# Text cleaning runs only when the raw CSV was loaded.
if LOAD_RAW_DATASET:
    dataframe['text'] = dataframe['text'].progress_apply(clean_text)
# Turn each stringified tag list into a real Python list.
dataframe['tags'] = dataframe['tags'].progress_apply(process_tag)
dataframe.head(5)

100%|██████████| 192367/192367 [00:00<00:00, 663057.65it/s]


Unnamed: 0,text,tags
0,photo josh riemer unsplash merri christma happ...,"[Mental Health, Health, Psychology, Science, N..."
1,brain coronaviru guid curiou troubl impact pan...,"[Mental Health, Coronavirus, Science, Psycholo..."
2,mind nose smell train chang brain six week whi...,"[Biotechnology, Neuroscience, Brain, Wellness,..."
3,passion synergi scienc technolog provid better...,"[Health, Neuroscience, Mental Health, Psycholo..."
4,heard phinea gage railroad worker surviv explo...,"[Brain, Health, Development, Psychology, Science]"


In [33]:
# Count how often every tag occurs across all articles. Counter is a dict
# subclass, so `tags_dict` still supports the usual dict operations.
tags_dict = Counter(tag for article_tags in dataframe['tags'] for tag in article_tags)
tag_occurence_treshold = 200

# Keep the tags that occur strictly more often than the threshold.
tags = [key for key, value in tags_dict.items() if value > tag_occurence_treshold]
# `remove_rare_tags` only tests membership, so give it a frozenset: each test
# becomes a hash lookup instead of a scan over the whole list of kept tags.
frequent_tags = frozenset(tags)
dataframe['tags'] = dataframe['tags'].progress_apply(lambda x: remove_rare_tags(x, frequent_tags))

100%|██████████| 192367/192367 [00:05<00:00, 38061.50it/s]


In [34]:
# Learn the tag vocabulary, then encode each article's tag list as one
# indicator row (one column per known tag).
tag_lists = dataframe['tags']
mlb = MultiLabelBinarizer().fit(tag_lists)
y = mlb.transform(tag_lists)

In [35]:
# Hold out 20% of the articles for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    dataframe['text'],
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# The vocabulary and IDF weights are fitted on the training texts only; the
# test texts are transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10_000,
    max_df=0.8,
)
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

In [37]:
# Sanity check: the feature matrix and the label matrix should have the same row count.
for matrix in (x_train_vectorized, y_train):
    print(matrix.shape)

(153893, 10000)
(153893, 1215)


In [None]:
# One-vs-rest: the wrapper fits a separate logistic regression per tag column of y_train.
base_estimator = LogisticRegression()
classifier = OneVsRestClassifier(base_estimator)
classifier.fit(x_train_vectorized, y_train)

In [41]:
# NOTE(review): for multilabel targets scikit-learn documents `score` as subset
# accuracy (a sample counts only if every one of its labels is predicted
# correctly), which would explain the very low percentages - confirm, and
# consider reporting a per-label metric alongside it.
train_score = classifier.score(x_train_vectorized, y_train)
print(f'Train score: {train_score: .2%}')
test_score = classifier.score(x_test_vectorized, y_test)
print(f'Test score: {test_score: .2%}')

Train score:  2.96%
Test score:  2.97%
