In [1]:
import pandas as pd
import numpy as np

## Text Preprocessing

Before the text is fed into the classifier, the following steps are taken:

1. Detect language
2. Translate into English
3. Lemmatization
4. Remove punctuation but not numbers
5. Stop word removal
6. Join the tokens back into a single space-separated string
7. Convert the string to lower case
8. Convert every value to a string

In [2]:
import nltk

from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
from nltk.corpus import wordnet as wn
# Fetch every NLTK resource this notebook reads: WordNet for the lemmatizer,
# the perceptron tagger for nltk.pos_tag, and the stop-word corpus that
# rm_stop_words_txt loads through nltk.corpus.stopwords.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/deepl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/deepl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Preprocessing Functions

In [3]:
# POS tags (as produced by nltk.pos_tag) mapped to the WordNet POS constants
# handed to the lemmatizer. Tags that are not listed here are looked up with
# .get() in lemmatize() and the word is then kept unchanged.
nltk_wordnet_tag_map = dict(
    NN=wn.NOUN,
    NNS=wn.NOUN,
    VBP=wn.VERB,
    VBG=wn.VERB,
    JJ=wn.ADJ,
)

from functools import reduce

def compose(*functions):
    """Compose callables right-to-left into a single callable.

    ``compose(f, g, h)(x)`` evaluates ``f(g(h(x)))``: the last function
    receives the call arguments and every earlier function receives the
    previous result.

    Args:
        *functions: Callables to chain. When none are given an identity
            function is returned, instead of letting ``reduce`` raise
            ``TypeError`` on an empty sequence.

    Returns:
        A callable that applies the whole chain.
    """
    if not functions:
        return lambda arg: arg

    def compose2(f1, f2):
        """Compose two functions"""
        return lambda *args: f1(f2(*args))
    return reduce(compose2, functions)

def translate_to_english_txt(row):
    """Return the row's excerpt in English, translating it when needed.

    Args:
        row: Mapping (dict or DataFrame row) with an ``"excerpt"`` entry.

    Returns:
        The original text when ``langid`` classifies it as ``'en'``, the
        ``googletrans`` translation otherwise, or ``''`` when detection or
        translation raises.
    """
    import warnings

    # NOTE(review): langid and googletrans are not imported in the visible
    # cells -- confirm they are imported before this function is used.
    text = row["excerpt"]
    try:
        if langid.classify(text)[0] != 'en':
            trans = googletrans.client.Translator()
            return trans.translate(text, 'en').text
        return text
    except Exception as e:
        # Best effort: still return '' so a DataFrame.apply keeps going, but
        # report the cause instead of silently blanking the excerpt.
        warnings.warn(
            "translate_to_english_txt failed ({0}: {1}); returning ''".format(
                type(e).__name__, e),
            RuntimeWarning,
        )
        return ''
    
def lemmatize(row, lemmatizer=WordNetLemmatizer()):
    """Lower-case, POS-tag and lemmatize text into one space-joined string.

    Args:
        row: A list of tokens, or any value that is converted with ``str``
            and split on whitespace.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.

    Returns:
        The tokens joined by single spaces; a token is lemmatized only when
        its POS tag has an entry in ``nltk_wordnet_tag_map``.
    """
    tokens = row if type(row) == list else str(row).split()
    lowered = [str(token).lower() for token in tokens]

    def to_lemma(word, tag):
        """Lemmatize one tagged word, or return it untouched for unmapped tags."""
        wordnet_pos = nltk_wordnet_tag_map.get(tag)
        return lemmatizer.lemmatize(word, wordnet_pos) if wordnet_pos else word

    return ' '.join([to_lemma(word, tag) for word, tag in nltk.pos_tag(lowered)])

def rm_punc_not_nums(inp, col=None):
    """Strip punctuation from text, handling numeric tokens separately.

    Text is split on single spaces and every token is cleaned:

    * a token made only of digits and punctuation (with at least one digit)
      has its non-word characters removed, so an underscore survives;
    * any other token has its non-word characters removed and then every
      ``string.punctuation`` character (underscore included) removed.

    Args:
        inp: A string, or a DataFrame when ``col`` is given.
        col: Column-name substring; the columns selected by
            ``DataFrame.filter(like=col)`` are cleaned element-wise.

    Returns:
        The cleaned string, or a new DataFrame holding only the selected,
        cleaned columns.

    Raises:
        Exception: If ``inp`` is neither a string nor a DataFrame with ``col``.
    """
    punc = string.punctuation
    transtable = str.maketrans("", "", punc)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # Compiled once instead of on every token.
    non_word = re.compile(r'\W+')

    def is_numeric(token):
        """True when the token is digits/punctuation only and has a digit."""
        return (
            all(ch.isdigit() or ch in punc for ch in token)
            and any(ch.isdigit() for ch in token)
        )

    def sing_rm(phr):
        """Remove for a single entity"""
        cleaned = []
        for token in str(phr).split(' '):
            stripped = non_word.sub('', token)
            if not is_numeric(token):
                stripped = stripped.translate(transtable)
            cleaned.append(stripped)
        return ' '.join(cleaned)

    if col and isinstance(inp, pd.DataFrame):
        selected = inp.filter(like=col)
        # DataFrame.applymap is deprecated in favour of DataFrame.map; use the
        # new name when this pandas has it and fall back otherwise.
        if hasattr(pd.DataFrame, "map"):
            return selected.map(sing_rm)
        return selected.applymap(sing_rm)
    elif isinstance(inp, str):
        return sing_rm(inp)
    else:
        raise Exception('Not a vaild type')


def _english_stop_words():
    """Load NLTK's English stop words once and cache them as a frozenset."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        try:
            words = nltk.corpus.stopwords.words('english')
        except LookupError:
            # Corpus not installed yet: fetch it once, then retry.
            nltk.download('stopwords')
            words = nltk.corpus.stopwords.words('english')
        cached = frozenset(words)
        _english_stop_words._cache = cached
    return cached


def rm_stop_words_txt(txt, swords=None):
    """ Remove stop words from given text

    Args:
        txt: Value to clean; converted with ``str`` and split on single spaces.
        swords: Collection of lower-case stop words. ``None`` (the default)
            loads NLTK's English list lazily on first use, so defining this
            function no longer requires the corpus to be present.

    Returns:
        The remaining tokens joined by single spaces; a token is dropped when
        its lower-cased form is in the stop-word collection.
    """
    if swords is None:
        stop_set = _english_stop_words()
    elif isinstance(swords, (list, tuple)):
        # Hash-based lookup instead of scanning the sequence for every token.
        stop_set = frozenset(swords)
    else:
        stop_set = swords
    return ' '.join(
        [token for token in str(txt).split(' ')
            if token.lower() not in stop_set]
    )

## Sanity Check

In [4]:
data = {"excerpt": "The 2 quick brown foxes jumped over the lazy dogs!"}

def preprocess(row):
    """Clean ``row["excerpt"]`` for the classifier.

    The excerpt is lemmatized, converted to a lower-case string with
    whitespace runs collapsed to single spaces, stripped of stop words and
    finally stripped of punctuation.
    """
    lemmatized = lemmatize(row["excerpt"])
    # str -> lower -> split -> join: one lower-case, single-spaced string.
    normalised = ' '.join(str(lemmatized).lower().split())
    without_stop_words = rm_stop_words_txt(normalised)
    return rm_punc_not_nums(without_stop_words)

preprocess(data)

'2 quick brown fox jumped lazy dogs'

## Model Selection Preparation

In [5]:
# Load the CSV whose "excerpt" and "sector" columns are used by the cells below.
# The relative path is resolved against the notebook's working directory.
df = pd.read_csv("../../data/all_en_processed_sectors_subsectors.csv")

In [6]:
# Drop the leftover index columns. Rebinding instead of inplace=True, together
# with errors="ignore", keeps this cell re-runnable once the columns are gone.
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [7]:
# Run preprocess on every row (axis=1 hands each row to preprocess, which
# reads row["excerpt"]) and store the cleaned text in a new column.
df["preprocessed_excerpt"] = df.apply(preprocess, axis=1)

In [8]:
merged_df = None
# One row per distinct preprocessed text, carrying the set of sectors seen for it.
df = df.groupby('preprocessed_excerpt')["sector"].apply(set).to_frame().reset_index()

In [9]:
# Cleaned texts with surrounding whitespace trimmed, in the row order of df.
documents = [text.strip() for text in df['preprocessed_excerpt'].values]

In [10]:
# Sort each row's sector set so the per-row label order does not depend on
# set iteration order.
temp_list = [sorted(x) for x in df["sector"]]

# Flatten, de-duplicate and sort the sector names.
category_columns = sorted({j for sub in temp_list for j in sub})

# Sector name -> integer id, following the sorted order above.
labels_text = {name: i for i, name in enumerate(category_columns)}

# Rows can carry different numbers of sectors, so fill an object array
# explicitly; np.array() on ragged nested lists is rejected by recent NumPy.
labels = np.empty(len(temp_list), dtype=object)
for row_i, sectors in enumerate(temp_list):
    labels[row_i] = [labels_text[y] for y in sectors]
labels

array([list([4]), list([9]), list([3]), ..., list([5]), list([3]),
       list([3])], dtype=object)

### doc2vec

In [47]:
# Load the CSV that supplies the "preprocessed_excerpt", "sector" and
# "feature_vector" columns read by the following cells.
# NOTE(review): this rebinds df, replacing the frame built in the cells above.
df = pd.read_csv("processed_with_vectors.csv")


In [56]:
# Collapse duplicate preprocessed texts: dfgroup1 collects their sectors as a
# set, dfgroup2 collects their stored feature vectors as a list.
dfgroup1 = df.groupby('preprocessed_excerpt')["sector"].apply(set).to_frame().reset_index()
dfgroup2 = df.groupby('preprocessed_excerpt')["feature_vector"].apply(list).to_frame().reset_index()

In [89]:
vectors = []
# Rows can carry different numbers of sectors, so fill an object array
# explicitly; np.array() on ragged nested lists is rejected by recent NumPy.
labels = np.empty(len(dfgroup1), dtype=object)
# Both frames come from the same group-by key and were reset to a positional
# index, so walking them in lockstep pairs the matching rows.
for row_i, (sectors, raw_vectors) in enumerate(
        zip(dfgroup1['sector'], dfgroup2['feature_vector'])):
    # Only the first stored vector of each group is used, so only that one is
    # parsed from its "[v1 v2 ...]" text form.
    tokens = raw_vectors[0].replace('[', '').replace(']', '').replace('\n', '').split()
    vectors.append([float(tok) for tok in tokens])
    # sorted() keeps the label order independent of set iteration order.
    labels[row_i] = [labels_text[y] for y in sorted(sectors)]
vectors = np.array(vectors)

In [92]:
import pickle
# The context manager closes the file handle once the dump has finished.
with open('vectors_doc2vec_dim50.pkl', 'wb') as pkl_file:
    pickle.dump([vectors, labels], pkl_file)

### TFIDF

In [11]:
# Drop the reference to the DataFrame (presumably to free memory before the
# TF-IDF steps); later cells must not rely on df.
df=None

In [12]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel

# Upper bound on the vocabulary size kept after filtering.
MAX_NB_WORDS = 20000

# Build the token <-> id mapping from the whitespace-tokenised documents.
dictionary = Dictionary([doc.split() for doc in documents])
dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=MAX_NB_WORDS)
dictionary.compactify()
print('Total %s unique tokens.' % len(dictionary))

Total 20000 unique tokens.


In [13]:
# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc.split()) for doc in documents]

model = TfidfModel(corpus)  # fit model
# TF-IDF weighted (token_id, weight) pairs per document.
vectors_tfidf = [model[bow] for bow in corpus]
corpus = None

In [14]:
# Despite the name this is a dense (documents x MAX_NB_WORDS) array; each
# (token_id, weight) pair of a document fills one cell of its row.
vectors_tfidf_sparse = np.zeros((len(vectors_tfidf), MAX_NB_WORDS))
for doc_i, doc in enumerate(vectors_tfidf):
    for token_id, weight in doc:
        vectors_tfidf_sparse[doc_i][token_id] = weight

In [16]:
# Drop the reference to the document list (presumably to free memory).
documents=None

In [18]:
import pickle
# Context managers close each file handle once its dump has finished.
with open('dictionary_labels_text.pkl', 'wb') as pkl_file:
    pickle.dump([dictionary, labels_text], pkl_file)
with open('vectors_tfidf_sparse.pkl', 'wb') as pkl_file:
    pickle.dump([vectors_tfidf_sparse, labels], pkl_file)

In [46]:
# Add categories as columns
#category_df = pd.concat([merged_df,pd.DataFrame(columns=category_columns)])

In [38]:
#category_df.fillna(value=0, inplace=True)

In [47]:
#def update(row):
#    for each in list(row["sector"]):
#        row[each] = 1
#    return row
#category_df = category_df.apply(update, axis=1)

In [48]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

# Pipeline, CountVectorizer and TfidfTransformer were used here without being
# imported in any visible cell; the imports above make this cell runnable.
# Unigram + bigram counts followed by term-frequency scaling (use_idf=False).
text_clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 2))),
            ('tfidf', TfidfTransformer(use_idf=False))
        ])

In [20]:
# NOTE(review): category_df is only assigned in the commented-out cells above,
# so on a fresh kernel this cell raises NameError -- confirm where it should
# come from.
# Fit the vectorising pipeline on the preprocessed text and transform it.
features = text_clf.fit_transform(category_df["preprocessed_excerpt"])
# The first 12 columns are taken as the label matrix -- presumably the
# per-sector indicator columns; TODO confirm against category_df's layout.
labels = category_df.iloc[:,:12]

In [21]:
# Convert the label frame to a plain NumPy array; model_cv reads its .shape.
labels = np.array(labels)

In [22]:
from sklearn.linear_model import RidgeClassifierCV
import sklearn.model_selection as ms
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [23]:
import matplotlib.pyplot as plt
%matplotlib inline

In [73]:
def model_cv(data, labels, model, n_splits=2):
    """Cross-validate one model and plot its per-fold test scores.

    Args:
        data: Feature matrix exposing ``.shape``, passed to ``cross_validate``.
        labels: Target array exposing ``.shape``.
        model: Dict with a display ``"name"`` and an estimator under ``"model"``.
        n_splits: Number of shuffled K-fold splits. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        The score dict produced by ``cross_validate``.
    """
    print ("Cross Validating: {0}".format(model["name"]))
    print ("Data shape", data.shape)
    print ("Labels shape", labels.shape)
    scoring = ['accuracy', 'precision_macro', 'recall_macro']
    folds = ms.KFold(n_splits=n_splits, shuffle=True, random_state=7)
    scores = cross_validate(model["model"], data, labels, scoring=scoring, cv=folds, return_train_score=False)
    # One x position per fold actually run. The previous fixed range(10) did
    # not match the number of scores, which plt.bar cannot plot.
    k = np.arange(len(scores['test_accuracy']))
    # Offset the three series so the bars sit side by side instead of
    # being drawn on top of each other.
    width = 0.25
    plt.bar(k - width, scores['test_accuracy'], width)
    plt.bar(k, scores['test_precision_macro'], width)
    plt.bar(k + width, scores['test_recall_macro'], width)
    plt.xticks(k)
    plt.ylabel("Percentage")
    plt.xlabel("Iteration")
    # Report the real fold count rather than a hard-coded "10".
    plt.title("{0} {1} Folds Cross Validation".format(model["name"], n_splits))
    plt.legend(['Testing Accuracy', 'Testing Precision', 'Testing Recall'], loc='upper left')
    return scores

In [59]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Candidate estimators to cross-validate; each entry pairs a display name with
# an estimator instance. The commented-out entries are currently disabled.
# NOTE(review): the active entry is labelled "Exponential Loss" although the
# estimator is RidgeClassifierCV -- confirm the label against the
# scikit-learn documentation.
models = [
#     {
#         "name": "MLP",
#         "model": MLPClassifier(activation='logistic', learning_rate='adaptive', verbose=True, early_stopping=True)
#     },
    {
        "name": "Ridge Classifier - Exponential Loss",
        "model": RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
    },
#     {
#         "name": "Gradient Boosting Classifier - Deviance Loss",
#         "model": GradientBoostingClassifier(loss="deviance", n_estimators=300)
#     },
    
]

# Cross-validate (and plot) every enabled model on the shared features/labels.
for model in models:
    model_cv(features, labels, model)

Cross Validating: Ridge Classifier - Exponential Loss
Data shape (15852, 376429)
Labels shape (15852, 12)
