In [1]:
import copy
import json
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from skopt.callbacks import VerboseCallback

from tensorflow.keras.models import Model, load_model
from gensim.models import KeyedVectors

from joblib import Parallel, delayed

In [2]:
# Only the pet identifier and its free-text description are used in this
# notebook, so let the CSV parser skip every other column instead of loading
# the full table and dropping the rest afterwards. The resulting columns keep
# the order they have in the file.
pet_train = pd.read_csv('Data/train/train.csv', usecols=['PetID', 'Description'])

In [3]:
seed = 42

# Pre-trained word vectors stored in word2vec text format. The path is built
# with os.path.join: the previous literal used backslashes inside a normal
# string ('\c' is an invalid escape sequence) and only resolved on Windows.
fasttext_path = os.path.join('Data', 'crawl-300d-2M.vec', 'crawl-300d-2M.vec')
model = KeyedVectors.load_word2vec_format(fasttext_path)

# Only pets that have a description can receive an embedding.
described = pet_train.loc[pet_train['Description'].notna(), ['PetID', 'Description']]
pet_ids = described['PetID']

vects = []
found_pet_ids = []
# Walk IDs and descriptions side by side. The previous version re-filtered the
# whole frame for every pet, which made this loop quadratic in the row count.
# NOTE(review): each row is paired with its own description, which matches the
# old first-match lookup as long as PetID values are unique - confirm.
for pet_id, description in zip(pet_ids, described['Description']):
    word_vectors = []
    # Tokens are obtained by splitting on single spaces only.
    for word in description.split(' '):
        try:
            word_vectors.append(model.get_vector(word))
        except KeyError:
            # Token without a vector in the loaded model: leave it out of the mean.
            pass
    if word_vectors:
        # Description embedding = element-wise mean of its word vectors.
        vects.append(np.mean(word_vectors, axis=0))
        found_pet_ids.append(pet_id)

# One row per embedded pet. The left merge keeps every pet, so pets without
# any usable token end up with NaN in the fasttext_* columns.
fasttext_train_df = pd.DataFrame(np.array(vects)).add_prefix('fasttext_')
fasttext_train_df['PetID'] = found_pet_ids
train_merged = pd.merge(pet_train, fasttext_train_df, how='left', on='PetID')

In [4]:
# Columns that hold free text
text_columns = ['Description']

# Missing text becomes an empty string
text_without_nans = train_merged[text_columns].fillna('')
train_merged[text_columns] = text_without_nans

# Text feature extractor class
# We use TF-IDF vectorizer and then extract SVD and NMF vectors with 13 components each

class TextFeatureExtractor():
    """Extracts dense text features from text columns.

    Every column is vectorised with TF-IDF and the resulting matrix is reduced
    twice, once with TruncatedSVD and once with NMF, to ``n_components`` values
    each. Output columns are named ``SVD_<column>_<i>`` and ``NMF_<column>_<i>``.

    Each column gets its own fitted vectorizer and decompositions, so
    ``transform`` applies to a column the models that were fitted on that same
    column. After ``fit_transform`` the ``tfidf``, ``svd`` and ``nmf``
    attributes refer to the models fitted on the last column processed.
    """
    def __init__(self, n_components):
        """Create the unfitted estimators.

        n_components: number of SVD components and of NMF components per column.
        """
        # token_pattern is a raw string ('\w' is an invalid escape sequence in a
        # normal literal) and the switches are real booleans: recent
        # scikit-learn releases reject integers for boolean parameters.
        self.tfidf = TfidfVectorizer(min_df=2, max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'\w+',
                          ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True)
        self.svd = TruncatedSVD(n_components=n_components, random_state=seed)
        self.nmf = NMF(n_components=n_components, random_state=seed)
        # column name -> (tfidf, svd, nmf) fitted on that column
        self._column_models = {}

    @staticmethod
    def _to_frames(col, svd_values, nmf_values):
        """Wrap one column's SVD and NMF outputs in prefixed DataFrames."""
        return [
            pd.DataFrame(svd_values).add_prefix('SVD_{}_'.format(col)),
            pd.DataFrame(nmf_values).add_prefix('NMF_{}_'.format(col)),
        ]

    def fit_transform(self, X_text):
        """Fit a separate set of models on every column and return its features.

        X_text: DataFrame whose columns contain text.
        Returns a DataFrame with the SVD and NMF features of all columns side
        by side, in column order (SVD block first, then NMF, per column).
        """
        # The current estimators act as configuration templates; every column
        # is fitted on its own copy so one column never overwrites another.
        templates = (self.tfidf, self.svd, self.nmf)
        self._column_models = {}
        text_features = []
        for col in X_text.columns:
            tfidf, svd, nmf = copy.deepcopy(templates)
            tfidf_col = tfidf.fit_transform(X_text[col])
            svd_col = svd.fit_transform(tfidf_col)
            nmf_col = nmf.fit_transform(tfidf_col)
            text_features.extend(self._to_frames(col, svd_col, nmf_col))

            self._column_models[col] = (tfidf, svd, nmf)
            # Keep the public attributes pointing at the latest fitted models.
            self.tfidf, self.svd, self.nmf = tfidf, svd, nmf

        return pd.concat(text_features, axis=1)

    def transform(self, X_text):
        """Compute features with the models fitted by ``fit_transform``.

        X_text: DataFrame whose columns contain text.
        A column that was not seen during fitting falls back to the models in
        ``tfidf``/``svd``/``nmf`` (those of the last fitted column).
        Returns a DataFrame laid out like the result of ``fit_transform``.
        """
        fallback = (self.tfidf, self.svd, self.nmf)
        text_features = []
        for col in X_text.columns:
            tfidf, svd, nmf = self._column_models.get(col, fallback)
            tfidf_col = tfidf.transform(X_text[col])
            svd_col = svd.transform(tfidf_col)
            nmf_col = nmf.transform(tfidf_col)
            text_features.extend(self._to_frames(col, svd_col, nmf_col))

        return pd.concat(text_features, axis=1)

    
# Number of SVD components and of NMF components extracted per text column
n_text_components = 13
text_feature_extractor = TextFeatureExtractor(n_components=n_text_components)

In [8]:
# Remove the raw description text from the merged frame
train_merged = train_merged.drop(columns=['Description'])

In [9]:
# Create the output folder if it is missing, then persist the frame as parquet
os.makedirs('Procesado', exist_ok=True)
train_merged.to_parquet('Procesado/train_text.parquet')

0
1
2
3
4
