### Multilabel Classification Training and Testing Data Preparation

***
This notebook generates the training and testing sets for a multilabel classification problem. The final dataframe includes categories, texts and labels. Topics are grouped based on a category-to-topics mapping generated from 'cat-mapping.txt', which was extracted from 'cat-descriptions_120396.txt'.
***
- Data Preprocessing
- Topics and Categories Mapping Generation
- Category and Label Construction

In [1]:
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import nltk

In [2]:
# Load the train/test splits from CSV. The `topics` column arrives as plain
# strings here; data_preprocess() below parses it into Python lists.
reuters_bert_train = pd.read_csv('result/reuters_bert_train.csv')
reuters_bert_test = pd.read_csv('result/reuters_bert_test.csv')

In [3]:
# (rows, columns) of the raw test split.
reuters_bert_test.shape

(3299, 3)

In [4]:
# Preview the raw test split before any preprocessing.
reuters_bert_test.head()

Unnamed: 0,id,topics,texts
0,16002,['acq'],hospital corp says it received 47 dlr a share ...
1,16003,['earn'],beverly enterprises <bev> sets regular dividen...
2,16004,['money-fx'],treasury's baker says floating exchange rate s...
3,16005,['crude'],"crude oil netbacks up sharply in europe, u.s.c..."
4,16006,['money-fx'],treasury's baker says system needs stabilitytr...


In [5]:
from ast import literal_eval

def data_preprocess(df):
    """Parse the ``topics`` column into lists and drop rows without topics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``topics`` column whose cells are either string
        representations of Python lists (as read back from CSV) or lists
        that have already been parsed.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``topics`` holds lists, rows with an empty
        topic list are removed, and the index is renumbered from 0.
        The input frame is not modified, so the function can safely be
        called again on its own output (e.g. when a cell is re-run).
    """
    def _as_list(value):
        # Only strings need parsing; literal_eval raises on a real list,
        # which is what used to break re-running this step.
        return literal_eval(value) if isinstance(value, str) else value

    # assign() returns a copy, leaving the caller's frame untouched.
    parsed = df.assign(topics=df['topics'].apply(_as_list))

    # Keep only the rows that carry at least one topic.
    has_topics = parsed['topics'].map(len) > 0

    return parsed[has_topics].reset_index(drop=True)


In [6]:
# Parse topics and drop topic-less rows in both splits; the names are
# rebound to the cleaned frames.
reuters_bert_train = data_preprocess(reuters_bert_train)
reuters_bert_test = data_preprocess(reuters_bert_test)

## Mapping Topics to Categories

Group the 135 topics into 9 categories:  
'money-fx', 'ship', 'interest', 'economic_indicator', 'currency', 'acq','earn', 'commodity', 'energy'

In [7]:
def cat_top_mapping(file_path):
    """Build a ``{category: [topic, ...]}`` mapping from a category file.

    The mapping is seeded with three single-topic categories
    ('money-fx', 'ship', 'interest'). The file is then read line by line:

    * blank lines are ignored;
    * a line starting with ``**`` opens a new category. Its key is the text
      after ``**`` split on single spaces, minus the last two tokens, joined
      with ``_`` and lower-cased;
    * a line containing ``(`` contributes its last space-separated token
      with the parentheses removed, lower-cased;
    * any other line contributes the whole (stripped) line, lower-cased.

    Parameters
    ----------
    file_path : str
        Path of the text file to parse.

    Returns
    -------
    dict
        Category name -> list of topic names, in file order.

    Raises
    ------
    ValueError
        If a topic line appears before any ``**`` category header.
    """
    mapping = {'money-fx':['money-fx'],'ship':['ship'],'interest':['interest']}
    category = None  # no '**' header seen yet

    with open(file_path, 'r') as file:
        for raw_line in file:
            # strip() also removes the trailing newline
            line = raw_line.strip()
            if not line:
                continue

            # header line -> start a new (empty) category
            if line.startswith('**'):
                category = '_'.join(line[2:].split(' ')[:-2]).lower()
                mapping[category] = []
                continue

            if category is None:
                raise ValueError(
                    "topic line %r found before any '**' category header in %s"
                    % (line, file_path)
                )

            # topic line: prefer the code in parentheses when there is one
            if '(' in line:
                line = re.sub('[()]', '', line.split(' ')[-1])
            mapping[category].append(line.lower())

    return mapping


In [8]:
# Build the category -> topics mapping from the Reuters category file.
file_path = 'reuters21578/cat-mapping.txt'
cat_top_map = cat_top_mapping(file_path)

In [9]:
# Add 'acq' and 'earn' as single-topic categories of their own and remove the
# 'corporate' entry. Values must be lists like every other entry: with a bare
# string, `topic in value` would be a substring test rather than a membership
# test.
cat_top_map['acq'] = ['acq']
cat_top_map['earn'] = ['earn']
# pop() with a default keeps this cell re-runnable (del raises KeyError the
# second time).
cat_top_map.pop('corporate', None)

In [10]:
# Final category names, in dict insertion order.
cat_top_map.keys()

dict_keys(['money-fx', 'ship', 'interest', 'economic_indicator', 'currency', 'commodity', 'energy', 'acq', 'earn'])

In [11]:
# Number of categories in the mapping.
print(len(cat_top_map))

9


## Add category
- add categories
- encoding categories

In [12]:
# Preview the preprocessed training split (topics are now lists).
reuters_bert_train.head()

Unnamed: 0,id,topics,texts
0,4005,"[interest, retail, ipi]",u.s. economic data key to debt futures outlook...
1,4012,[earn],bank of british columbia 1st qtr jan 31 netope...
2,4014,[earn],restaurant associates inc <ra> 4th qtr jan 3sh...
3,4015,[earn],michigan general corp <mgl> 4th qtrshr loss 1....
4,4016,"[crude, nat-gas, iron-steel]","usx <x> proved oil, gas reserves fall in 1986u..."


Add categories

In [13]:
# add category columns
def add_category(cat_top_map, topics):
    """Return the categories that the given topics belong to.

    Parameters
    ----------
    cat_top_map : dict
        Category name -> collection of topic names. A bare string value is
        treated as a single topic name (not as a sequence of characters).
    topics : iterable of str
        Topic names attached to one document.

    Returns
    -------
    list of str
        Each matching category exactly once, in the iteration order of
        ``cat_top_map``. The order is therefore reproducible between runs,
        unlike the order of a ``set`` of strings. Topics that are in no
        category are ignored.
    """
    matched = []
    for key, members in cat_top_map.items():
        # Guard against a bare-string value: `topic in 'acq'` would be a
        # substring test and match e.g. 'ac'.
        if isinstance(members, str):
            members = [members]
        if any(topic in members for topic in topics):
            matched.append(key)
    return matched

# Add a `categories` column to both splits: for each row, the list of
# categories its topics map to according to cat_top_map.
reuters_bert_train['categories'] = reuters_bert_train['topics'].apply(lambda x: add_category(cat_top_map, x))
reuters_bert_test['categories'] = reuters_bert_test['topics'].apply(lambda x: add_category(cat_top_map, x))



Encode categories and add labels

In [14]:
def cat_encoding(topics,total_category):
    """Multi-hot encode a list of category names.

    Parameters
    ----------
    topics : collection of str
        The category names present for one document.
    total_category : sequence of str
        All category names; its order defines the positions in the output.

    Returns
    -------
    list of int
        One entry per element of ``total_category``: 1 if that category is
        in ``topics``, else 0. The length follows ``total_category`` instead
        of being fixed at 9.
    """
    return [1 if value in topics else 0 for value in total_category]

In [15]:
# The key order of cat_top_map fixes the position of each category in the
# label vector; add the encoded vector as a `labels` column to both splits.
total_category = list(cat_top_map.keys())
reuters_bert_train['labels'] = reuters_bert_train['categories'].apply(lambda x: cat_encoding(x,total_category))
reuters_bert_test['labels'] = reuters_bert_test['categories'].apply(lambda x: cat_encoding(x,total_category))


In [16]:
# Preview the test split with the new `categories` and `labels` columns.
reuters_bert_test.head()

Unnamed: 0,id,topics,texts,categories,labels
0,16002,[acq],hospital corp says it received 47 dlr a share ...,[acq],"[0, 0, 0, 0, 0, 0, 0, 1, 0]"
1,16003,[earn],beverly enterprises <bev> sets regular dividen...,[earn],"[0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,16004,[money-fx],treasury's baker says floating exchange rate s...,[money-fx],"[1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,16005,[crude],"crude oil netbacks up sharply in europe, u.s.c...",[energy],"[0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,16006,[money-fx],treasury's baker says system needs stabilitytr...,[money-fx],"[1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [62]:
def category_distribution(df):
    """Count how many rows each category appears in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``categories`` column whose cells are lists of
        category names.

    Returns
    -------
    pandas.DataFrame
        Indexed by category name with a single integer column ``Total``,
        sorted by ``Total`` in descending order. An input without any
        category yields an empty frame that still has the ``Total`` column.
    """
    counts = {}
    for categories in df['categories']:
        for cat in categories:
            counts[cat] = counts.get(cat, 0) + 1

    # Going through a named Series keeps the 'Total' column even when
    # `counts` is empty; assigning column names afterwards fails in that case.
    data = pd.Series(counts, dtype='int64', name='Total').to_frame()

    # mergesort is stable, so the order of tied categories is deterministic.
    return data.sort_values(by='Total', ascending=False, kind='mergesort')

In [63]:
# Per-category row counts for each split, used by the bar chart below.
train_distribution = category_distribution(reuters_bert_train)
test_distribution = category_distribution(reuters_bert_test)

### Simple Visualization with Plotly

In [73]:
import plotly.graph_objects as go

# Grouped bar chart comparing per-category counts in the train and test splits.
fig = go.Figure(data=[
    go.Bar(name='Train', x=train_distribution.index, y=train_distribution.Total),
    go.Bar(name='Test', x=test_distribution.index, y=test_distribution.Total)
])
# Show the two splits side by side and label the figure so it stands alone
# (the previous title was cut off after "among").
fig.update_layout(
    barmode='group',
    title_text='Category Distribution among Train and Test Sets',
    xaxis_title='Category',
    yaxis_title='Number of documents',
)
fig.show()

In [17]:
# Persist both splits. NOTE: to_csv() is called with its defaults, so the
# row index is written as an extra unnamed first column, and the list-valued
# columns are stored as their string representation and need parsing when
# the files are read back (as data_preprocess() does for `topics`).
reuters_bert_train.to_csv('result/reuters_multilabel_train.csv')
reuters_bert_test.to_csv('result/reuters_multilabel_test.csv')

### Check

In [18]:
# Number of categories attached to each training row, then summary statistics
# (the minimum shows whether any row ended up without a category).
topics_size = reuters_bert_train['categories'].apply(len)
topics_size.describe()

count    7775.000000
mean        1.076013
std         0.284239
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         4.000000
Name: categories, dtype: float64

In [19]:
# unique number of categories
res = set()
for category in list(reuters_bert_train['categories']):
    for i in category:
        res.add(i)
        

In [20]:
# Display the distinct categories (set order is arbitrary).
list(res)

['money-fx',
 'ship',
 'acq',
 'economic_indicator',
 'commodity',
 'currency',
 'interest',
 'energy',
 'earn']

In [21]:
# Sample category list used to spot-check cat_encoding().
a = ['economic_indicator', 'interest']


In [22]:
# Encode the sample list against the mapping's key order; a 1 is expected at
# the positions of 'interest' and 'economic_indicator'.
total_cat = list(cat_top_map.keys())
cat_encoding(a,total_cat)

[0, 0, 1, 1, 0, 0, 0, 0, 0]