In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [173]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import pickle as pk
import numpy as np
import sklearn.manifold as man
from tensorflow.python.framework import ops

from emoji2vec.model import Emoji2Vec, ModelParams
from emoji2vec.phrase2vec import Phrase2Vec
from emoji2vec.utils import build_kb, get_examples_from_kb, generate_embeddings, get_metrics

from src.constants import (E2V_MAPPING_PATH, EMOJI_2_TOP_INDEX_PATH,E2V_DATA_DIR,EXPORT_DIR,
                           EMBEDDING_TRAINING_DATA_DIR,W2V_PATH)

In [190]:
# Peek at the raw emoji2vec training data: tab-separated, no header row.
# NOTE(review): the path ends with a directory separator — confirm which file
# inside the folder is meant to be read here.
a = pd.read_csv(
    "../data/raw/e2v/training/",
    sep="\t",
    header=None,
)
a.shape

(500, 3)

In [174]:
# Load the pickled mapping stored at EMOJI_2_TOP_INDEX_PATH
# (presumably emoji -> index, judging by the constant's name — confirm).
# The context manager guarantees the file handle is closed after reading.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(EMOJI_2_TOP_INDEX_PATH, "rb") as mapping_file:
    dic = pk.load(mapping_file)

# Keys of the mapping, as a set for fast membership tests.
ems = set(dic.keys())

# Production dataset; the first CSV column is used as the index.
dataset_df = pd.read_csv(EXPORT_DIR.joinpath("data/dataset/emoji_dataset_prod.csv"), index_col=0)

In [175]:
def split_dataset_df(dataset_df, ratios, random_state=None):
    """
    Split the dataset into train / dev / test parts, emoji by emoji.

    The rows of every emoji are shuffled and then cut at the cumulative
    ratio boundaries, so each emoji is spread over the three splits
    according to `ratios`.

    Args:
        dataset_df (pd.df): dataframe with an 'emoji' column
        ratios (sequence of float): (train, dev, test) fractions, summing to 1
        random_state (int, optional): seed for the per-emoji shuffles; when
            None (default) the global numpy random state is used

    Return:
        [pd.df]: train dataframe
        [pd.df]: dev dataframe
        [pd.df]: test dataframe
    """
    # Float ratios such as (0.7, 0.2, 0.1) may not sum to exactly 1.0,
    # so compare with a tolerance instead of strict equality.
    assert np.isclose(sum(ratios), 1), "ratios must sum to 1"

    emojis = dataset_df['emoji'].unique()
    if len(emojis) == 0:
        # pd.concat refuses an empty list; return three empty frames instead.
        return dataset_df.iloc[:0], dataset_df.iloc[:0], dataset_df.iloc[:0]

    # One generator shared by all groups so a single seed drives every shuffle.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Tiny tolerance so a product like 8.999999999999998 (meant to be 9)
    # is not truncated to 8 by int().
    eps = 1e-9

    grouped_df = dataset_df.groupby('emoji')
    train = []
    dev = []
    test = []
    for em in emojis:
        df = grouped_df.get_group(em).sample(frac=1, random_state=rng)
        n = df.shape[0]
        n1 = int(n * ratios[0] + eps)
        n2 = int(n * (ratios[0] + ratios[1]) + eps)
        train.append(df.iloc[:n1])
        dev.append(df.iloc[n1:n2])
        test.append(df.iloc[n2:])

    train = pd.concat(train, axis=0)
    dev = pd.concat(dev, axis=0)
    test = pd.concat(test, axis=0)
    return train, dev, test

In [176]:
def convert_to_e2v_format(df):
    """
    Turn a dataframe into shuffled (word, emoji, label) positive examples.

    Args:
        df (pd.df): dataframe with at least 'word' and 'emoji' columns

    Return:
        [pd.df]: copy restricted to the columns word / emoji / label, with
                 label set to True, a fresh 0..n-1 index and rows shuffled
    """
    # assign() works on a copy, so the caller's dataframe is left untouched.
    labelled = df.assign(label=True)
    triples = labelled[['word', 'emoji', 'label']]
    renumbered = triples.reset_index(drop=True)
    return renumbered.sample(frac=1)

In [177]:
def get_neg_df(df):
    """
    Build negative examples by pairing each emoji with a word it never had.

    For every row, the 'word' is replaced by a word drawn at random (via
    np.random.choice) among the words of `df` that are not associated with
    that row's emoji anywhere in `df`; 'label' is set to False.

    Args:
        df (pd.df): dataframe with 'word' and 'emoji' columns

    Return:
        [pd.df]: copy of `df` with replaced words and label False

    Raises:
        ValueError: if some emoji is associated with every word of `df`,
                    leaving no word to draw a negative example from
    """
    df_neg = df.copy()

    # Words seen with each emoji, and the overall vocabulary.
    em_vocs = df_neg.groupby('emoji')['word'].agg(lambda x: set(x)).to_dict()
    # set().union(...) also works when there are no groups at all.
    tot_voc = set().union(*em_vocs.values())

    # Candidate pool per emoji, computed once rather than once per row.
    # Sorting gives a stable order, so seeding np.random makes the draws
    # reproducible (plain set iteration order varies between runs).
    candidates = {em: sorted(tot_voc - voc, key=str)
                  for em, voc in em_vocs.items()}

    exhausted = [em for em, pool in candidates.items() if not pool]
    if exhausted:
        raise ValueError(
            "no negative word available for emoji(s): {}".format(exhausted))

    df_neg['word'] = [np.random.choice(candidates[em])
                      for em in df_neg['emoji']]
    df_neg['label'] = False
    return df_neg

In [195]:
def _with_negative_examples(split_df):
    """Convert a split to e2v format, add one negative example per row, shuffle."""
    pos_df = convert_to_e2v_format(split_df)
    neg_df = get_neg_df(pos_df)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([pos_df, neg_df]).sample(frac=1)


def get_train_val_test(dataset_df):
    """
    Generate the dataframe associated to the right format to train
    a w2v model on the emojis

    The dataset is split 80% / 10% / 10% per emoji. The train split holds
    positive examples only; the dev and test splits additionally receive
    one negative example per positive one.

    Args:
        dataset_df (pd.df): dataframe in production format

    Return:
        [pd.df]: train dataframe
        [pd.df]: validation dataframe with neg sampling
        [pd.df]: test dataframe with neg sampling

    Raises:
        AssertionError: if some emoji of `dataset_df` is missing from the
                        train, dev or test split
    """
    ratios = (0.8, 0.1, 0.1)
    train_df, dev_df, test_df = split_dataset_df(dataset_df, ratios)

    train_df = convert_to_e2v_format(train_df)
    dev_df = _with_negative_examples(dev_df)
    test_df = _with_negative_examples(test_df)

    # Every emoji must be represented in each of the three splits.
    n_emojis = dataset_df['emoji'].unique().shape[0]
    assert(train_df['emoji'].unique().shape[0] == n_emojis)
    assert(test_df['emoji'].unique().shape[0] == n_emojis)
    assert(dev_df['emoji'].unique().shape[0] == n_emojis)

    return train_df, dev_df, test_df

In [196]:
# Build the train / dev / test frames from the production dataset.
train_df, dev_df, test_df = get_train_val_test(dataset_df)

In [201]:
# Write each split as a tab-separated file without header row or index column.
for split_file, split_df in (("train.txt", train_df),
                             ("dev.txt", dev_df),
                             ("test.txt", test_df)):
    split_df.to_csv(
        EMBEDDING_TRAINING_DATA_DIR.joinpath(split_file),
        sep="\t",
        header=None,
        index=False,
    )

In [35]:
# --- Output location for the trained embeddings -----------------------------
export_dir = EXPORT_DIR.joinpath("data/embeddings")
export_dir.mkdir(exist_ok=True, parents=True)

# --- Training corpus selection ----------------------------------------------
# The corpus is picked by name, rather than by letting a second group of
# assignments silently overwrite the first one.
#   "emoji_dataset": files under EMBEDDING_TRAINING_DATA_DIR
#   "e2v":           emoji2vec training data under E2V_DATA_DIR
dataset_name = "e2v"

dataset_configs = {
    "emoji_dataset": {
        "mapping_path": EMOJI_2_TOP_INDEX_PATH,
        "data_dir": EMBEDDING_TRAINING_DATA_DIR,
        "embeddings_file": export_dir.joinpath("em_dataset_embeddings.pk"),
    },
    "e2v": {
        "mapping_path": E2V_MAPPING_PATH,
        "data_dir": E2V_DATA_DIR.joinpath("training/"),
        "embeddings_file": export_dir.joinpath("e2v_embeddings.pk"),
    },
}
dataset_config = dataset_configs[dataset_name]

# Paths are converted to plain strings for the code that consumes them.
word2vec_path = str(W2V_PATH)
mapping_path = str(dataset_config["mapping_path"])
data_dir = str(dataset_config["data_dir"])
embeddings_file = str(dataset_config["embeddings_file"])

# --- Model hyper-parameters -------------------------------------------------
in_dim = 300   # Length of word2vec vectors
out_dim = 300  # Desired dimension of output vectors
pos_ex = 4
neg_ratio = 1
max_epochs = 40
dropout = 0.0

params = ModelParams(in_dim=in_dim, out_dim=out_dim, pos_ex=pos_ex, max_epochs=max_epochs,
                    neg_ratio=neg_ratio, learning_rate=0.001, dropout=dropout, class_threshold=0.5)

# --- Checkpoint and trained-model locations ---------------------------------
# NOTE(review): the folder name encodes ep-80 / dr-1 while max_epochs is 40 and
# dropout is 0.0 above — confirm these are meant to differ.
ckpt_path = './results/unicode/k-300_pos-4_rat-1_ep-80_dr-1/model.ckpt'
e2v_path = "./results/unicode/k-300_pos-4_rat-1_ep-80_dr-1/emoji2vec.bin"

# Alternatives kept for reference:
#ckpt_path = './results/unicode/k-300_pos-4_rat-1_ep-40_dr-1/model.ckpt'
#ckpt_path = './results/model.ckpt'
# e2v_path = params.model_folder('unicode') + '/emoji2vec.bin'
# e2v_path = "pre-trained/emoji2vec.bin"

print(e2v_path)

./results/unicode/k-300_pos-4_rat-1_ep-80_dr-1/emoji2vec.bin


In [37]:
# data_dir may be a str or a pathlib.Path depending on how it was configured.
# Formatting it handles both, whereas `'...' + data_dir` raises
# "TypeError: must be str, not PosixPath" when it is a Path.
print('reading training data from: {}'.format(data_dir))

# Build the knowledge base from the training folder. The path is passed as a
# plain string (assumes build_kb expects a str path — TODO confirm).
train_kb, ind2phr, ind2emoj = build_kb(str(data_dir))

# Persist the index -> emoji mapping; the context manager closes (and thereby
# flushes) the file even if pickling fails.
with open(mapping_path, 'wb') as mapping_file:
    pk.dump(ind2emoj, mapping_file)

TypeError: must be str, not PosixPath