In [None]:
import pandas as pd
import numpy as np
import os
import ast

## Clicks & Impressions (Relationships)

In [None]:
# Load the behaviour logs for BOTH the train and validate splits. The files
# are tab-separated without a header row, so column names are supplied here.
col_names = ['impressionId', 'userId', 'timeString', 'history', 'impressions']
impression_df_train = pd.read_csv(
    './data/train/behaviors.tsv', names=col_names, header=None, sep='\t'
).assign(splitSet='TRAIN')
impression_df_validate = pd.read_csv(
    './data/validate/behaviors.tsv', names=col_names, header=None, sep='\t'
).assign(splitSet='VALIDATION')

# Stack the two splits; splitSet records which file each row came from.
impression_df = pd.concat(
    [impression_df_train, impression_df_validate],
    ignore_index=True,
)
#impression_df 

In [None]:
# Parse the raw timestamp strings into datetime64 values.
# `infer_datetime_format` was removed from the call: it is deprecated since
# pandas 2.0 (where it has no effect and only emits a warning), and the
# parsed values are the same without it. If parsing is slow on an older
# pandas, pass an explicit `format=` once the timestamp layout is confirmed.
impression_df['time'] = pd.to_datetime(impression_df['timeString'])
#impression_df

In [None]:
def listify(x):
    """Split a space-delimited string into a list of tokens.

    Returns None when ``x`` is missing (as judged by ``pd.isna``); otherwise
    returns ``x.split(' ')``, so consecutive spaces yield empty tokens.
    """
    return None if pd.isna(x) else x.split(' ')

# Tokenise the two space-delimited columns; rows with a missing value get None.
impression_df['historyList'] = impression_df['history'].apply(listify)
impression_df['impressionList'] = impression_df['impressions'].apply(listify)
#impression_df

In [None]:
# Question: is the history consistently included, or are earlier clicks cut off?
# Per user: count impressions, keep the alphabetically smallest split label,
# and collect the set of distinct raw history strings seen for that user.
historic_df = (
    impression_df
    .groupby('userId')
    .agg({
        'impressionId': 'count',
        'splitSet': 'min',
        'history': lambda x: set(x),
    })
    .reset_index()
    .sort_values(by='impressionId', ascending=False)
)
historic_df['numUniqueHistories'] = historic_df.history.apply(len)

# Show users that have more than one distinct history string.
# Author's observation from the original run: all impression histories are
# the same for each user.
historic_df[historic_df.numUniqueHistories > 1]

In [None]:
def listify_from_set(x):
    """Flatten a collection of space-delimited history strings into one list.

    Parameters
    ----------
    x : iterable
        History strings; null entries (as judged by ``pd.isnull``) are skipped.

    Returns
    -------
    list of str
        Every non-empty token from every non-null string, in iteration order.
        An input holding only nulls yields ``[]``; the previous join/split
        form returned ``['']`` there, i.e. one bogus empty news ID.
    """
    tokens = []
    for history in x:
        if pd.isnull(history):
            continue
        # Drop empty tokens so blank strings or doubled spaces never become IDs.
        tokens.extend(token for token in history.split(' ') if token)
    return tokens
# Turn each user's set of history strings into a flat list of news IDs, then
# reshape to one row per (user, news ID) pair.
historic_df['historyList'] = historic_df.history.apply(listify_from_set)
historic_df = (
    historic_df
    .drop(columns=['impressionId', 'history', 'numUniqueHistories'])
    .explode('historyList')
    .rename(columns={'historyList': 'newsId'})
)
historic_df

In [None]:
# One row per impression entry: keep rows that have an impression list,
# then explode the list so each "<newsId>-<code>" token gets its own row.
impression_click_df = (
    impression_df.loc[
        impression_df.impressionList.notnull(),
        ['impressionId', 'userId', 'time', 'impressionList', 'splitSet'],
    ]
    .explode('impressionList')
    .rename(columns={'impressionList': 'newsIdHit'})
)

# Split each token on '-' into the news ID and the click code.
impression_click_df[["newsId", "userClickedCode"]] = impression_click_df.newsIdHit.str.split('-', expand=True)
impression_click_df = impression_click_df.drop(columns=['newsIdHit'])
impression_click_df

In [None]:
# Partition the impression rows by click code ('1' vs '0'); the code column
# is dropped from both outputs because the frame itself now encodes it.
clicked_df = (
    impression_click_df
    .loc[impression_click_df['userClickedCode'].eq('1')]
    .drop(columns=['userClickedCode'])
)
did_not_click_df = (
    impression_click_df
    .loc[impression_click_df['userClickedCode'].eq('0')]
    .drop(columns=['userClickedCode'])
)


In [None]:
# Directory that receives every cleaned CSV written by this notebook.
output_path = './data/clean'
if os.path.isdir(output_path):
    print(f'Directory {output_path} already exists')
else:
    # makedirs (unlike mkdir) also creates missing parent directories, and
    # exist_ok=True tolerates the directory appearing between check and create.
    os.makedirs(output_path, exist_ok=True)
    print(f'Created new directory: {output_path}')

In [None]:
# Persist the three user -> news relationship tables (index column omitted).
_click_exports = {
    f'{output_path}/historic-clicks.csv': historic_df,
    f'{output_path}/clicks.csv': clicked_df,
    f'{output_path}/did-not-click.csv': did_not_click_df,
}
for _csv_path, _frame in _click_exports.items():
    _frame.to_csv(_csv_path, index=False)

## Adding Users

In [None]:
# One row per distinct userId (first occurrence kept), as a single-column frame.
user_df = impression_df.drop_duplicates(subset=['userId'])[['userId']]
user_df

In [None]:
# Write the distinct user IDs to users.csv without the index column.
user_df.to_csv(f'{output_path}/users.csv', index = False)

## News

In [None]:
# Column layout applied to the header-less, tab-separated news files.
col_names = [
    'newsId', 'category', 'subCategory', 'title', 'abstract', 'url',
    'titleEntities', 'abstractEntities',
]

# NOTE(review): read_csv applies its default quote handling here; free text
# containing unbalanced double quotes could be mis-split across columns.
# Consider whether quoting should be disabled — confirm against the raw files.
news_train_df = pd.read_csv(
    './data/train/news.tsv', names=col_names, header=None, sep='\t'
)
news_validate_df = pd.read_csv(
    './data/validate/news.tsv', names=col_names, header=None, sep='\t'
)

### Wiki Title Entities

In [None]:
# Stack the train and validate news tables and drop rows that are identical
# in every column. Rows sharing a newsId but differing in any other column
# are NOT collapsed by this call.
news_and_entities_df = pd.concat([news_train_df, news_validate_df],ignore_index=True).drop_duplicates()

In [None]:
# Keep only the article key and its raw title-entity string; .copy() yields
# an independent frame that later cells can add columns to.
raw_title_entity_df = news_and_entities_df[['newsId', 'titleEntities']].copy()

In [None]:
def get_entities(entity_string):
    """Parse a serialized entity list into Python objects.

    Returns an empty list when ``entity_string`` is missing (``pd.isna``);
    otherwise returns whatever ``ast.literal_eval`` produces for the string.
    Malformed input propagates the exception raised by ``literal_eval``.
    """
    if pd.isna(entity_string):
        return []
    return ast.literal_eval(entity_string)

# Parse every raw title-entity string into a Python list (empty list when missing).
raw_title_entity_df['titleEntitiesFormatted'] = raw_title_entity_df.titleEntities.apply(get_entities)

In [None]:
# One row per (article, entity): explode the parsed lists, discard the raw
# string column, drop rows left with a null (e.g. articles whose list was
# empty), and renumber the index from 0.
raw_title_entity_df = (
    raw_title_entity_df
    .explode('titleEntitiesFormatted', ignore_index=True)
    .drop(columns=['titleEntities'])
    .dropna()
    .reset_index(drop=True)
)
#raw_title_entity_df

In [None]:
# Flatten the per-row entity records into a table with one column per key.
title_ent_df = pd.json_normalize(raw_title_entity_df.titleEntitiesFormatted)
#title_ent_df

In [None]:
# Attach the originating newsId to each flattened entity row. The axis=1
# concat pairs rows by index label, so both frames must share the same index.
# NOTE(review): this relies on raw_title_entity_df having a fresh 0..n-1 index
# and on json_normalize returning a matching one — confirm on your pandas version.
title_entity_rel_df = pd.concat([title_ent_df, raw_title_entity_df[['newsId']]], axis=1)
#title_entity_rel_df

In [None]:

#title_entity_rel_df.hist('Confidence')

In [None]:
# Entity node table from titles: strip the per-mention and per-article
# columns, then collapse identical rows.
title_entity_df = (
    title_entity_rel_df
    .drop(columns=['Confidence', 'OccurrenceOffsets', 'SurfaceForms', 'newsId'])
    .drop_duplicates()
)
#title_entity_df

### Wiki Abstract Entities

In [None]:
# Keep only the article key and its raw abstract-entity string; .copy() yields
# an independent frame that later cells can add columns to.
raw_abstract_entity_df = news_and_entities_df[['newsId', 'abstractEntities']].copy()

In [None]:
# Reuse get_entities from the title-entity section earlier in this notebook
# instead of redefining an identical copy here; a duplicate definition
# silently shadows the first and invites the two drifting apart.
# Parse every raw abstract-entity string into a Python list (empty when missing).
raw_abstract_entity_df['abstractEntitiesFormatted'] = raw_abstract_entity_df.abstractEntities.apply(get_entities)

In [None]:
# One row per (article, entity): explode the parsed lists, discard the raw
# string column, drop rows left with a null (e.g. articles whose list was
# empty), and renumber the index from 0.
raw_abstract_entity_df = (
    raw_abstract_entity_df
    .explode('abstractEntitiesFormatted', ignore_index=True)
    .drop(columns=['abstractEntities'])
    .dropna()
    .reset_index(drop=True)
)
#raw_abstract_entity_df

In [None]:
# Flatten the per-row entity records into a table with one column per key.
abstract_ent_df = pd.json_normalize(raw_abstract_entity_df.abstractEntitiesFormatted)
#abstract_ent_df

In [None]:
# Attach the originating newsId to each flattened entity row. The axis=1
# concat pairs rows by index label, so both frames must share the same index.
# NOTE(review): this relies on raw_abstract_entity_df having a fresh 0..n-1
# index and on json_normalize returning a matching one — confirm on your pandas version.
abstract_entity_rel_df = pd.concat([abstract_ent_df, raw_abstract_entity_df[['newsId']]], axis=1)
#abstract_entity_rel_df

In [None]:

#abstract_entity_rel_df.hist('Confidence')

In [None]:
# Entity node table from abstracts: strip the per-mention and per-article
# columns, then collapse identical rows.
abstract_entity_df = (
    abstract_entity_rel_df
    .drop(columns=['Confidence', 'OccurrenceOffsets', 'SurfaceForms', 'newsId'])
    .drop_duplicates()
)
#abstract_entity_df

### Combine and Write Wiki Entities

In [None]:
# Stack the title-derived and abstract-derived entity tables into one frame.
entity_df = pd.concat([title_entity_df, abstract_entity_df], ignore_index=True)
#entity_df

In [None]:
# Collapse entities that appear identically in both the title and abstract tables.
entity_df = entity_df.drop_duplicates()
#entity_df

In [None]:
# Some WikidataIds still occur on more than one row even after exact-duplicate
# rows were removed, i.e. the rows disagree in another column. Collect those
# IDs so they can be excluded for now, and show the offending rows.
dup_wiki_ids = entity_df.loc[entity_df['WikidataId'].duplicated(), 'WikidataId'].tolist()
entity_df[entity_df['WikidataId'].isin(dup_wiki_ids)].sort_values('WikidataId')

In [None]:
# Exclude relationships that point at an ambiguous WikidataId, then write
# the title -> entity relationship table.
title_entity_rel_df = title_entity_rel_df.loc[
    ~title_entity_rel_df['WikidataId'].isin(dup_wiki_ids)
]
title_entity_rel_df.to_csv(f'{output_path}/title-entity-rel.csv', index=False)

In [None]:
# Exclude relationships that point at an ambiguous WikidataId, then write
# the abstract -> entity relationship table.
abstract_entity_rel_df = abstract_entity_rel_df.loc[
    ~abstract_entity_rel_df['WikidataId'].isin(dup_wiki_ids)
]
abstract_entity_rel_df.to_csv(f'{output_path}/abstract-entity-rel.csv', index=False)

In [None]:
# Display the abstract-entity relationship frame for a visual check.
abstract_entity_rel_df

In [None]:
# Remove the ambiguous WikidataIds from the entity table itself and write it.
entity_df = entity_df.loc[~entity_df['WikidataId'].isin(dup_wiki_ids)]
entity_df.to_csv(f'{output_path}/entities.csv', index=False)

### News Articles

In [None]:
# Article table: the combined news frame without the two raw entity columns.
news_df = news_and_entities_df.drop(columns = ['titleEntities', 'abstractEntities'])

In [None]:
# Display the article frame for a visual check.
news_df

In [None]:
# Replace missing abstracts with an empty string.
news_df['abstract'] = news_df['abstract'].fillna('')

In [None]:
# Swap every double quote for a single quote in the two free-text columns.
# (A similar comma -> ' -' substitution was tried earlier and is left disabled.)
news_df['abstract'] = news_df['abstract'].str.replace('"', "'")
news_df['title'] = news_df['title'].str.replace('"', "'")

In [None]:
# Manual correction for article N18259: overwrite its abstract and url with
# hand-entered values.
# NOTE(review): presumably this row was mis-parsed from the raw TSV — confirm
# against the source file before relying on these values.
news_df.loc[news_df.newsId == 'N18259', 'abstract'] = "He'll be in better shape physically and he'll be in a better place mentally too,'' coach Freddie Kitchens said of Mayfield"
news_df.loc[news_df.newsId == 'N18259', 'url'] = 'https://assets.msn.com/labs/mind/AAIPbTZ.html'

In [None]:
# Manual correction for article N16590: overwrite its abstract and url with
# hand-entered values.
# NOTE(review): presumably this row was mis-parsed from the raw TSV — confirm
# against the source file before relying on these values.
news_df.loc[news_df.newsId == 'N16590', 'abstract'] = "We're seeing really disturbing things coming out of the transcripts... the most disturbing I thought, aside from the open extortion of a foreign government, was the fact that the secretary of state had to call Sean Hannity to find out how the president was feeling about an ambassador, Jess McIntosh says. Tara Dowdell and Max Boot join the conversation."
news_df.loc[news_df.newsId == 'N16590', 'url'] = 'https://assets.msn.com/labs/mind/BBWymjv.html'



In [None]:
# Write the cleaned article table to news.csv without the index column.
news_df.to_csv(f'{output_path}/news.csv', index = False)

## Entity Embeddings

In [None]:
# Name the entity-id column plus 101 value columns (x_0 .. x_100); the last
# one, x_100, is dropped again in a later cell.
col_names = ['wikiEntityId', *(f'x_{i}' for i in range(101))]
entity_emb_train_df = pd.read_csv(
    './data/train/entity_embedding.vec', names=col_names, header=None, sep='\t'
)
entity_emb_validate_df = pd.read_csv(
    './data/validate/entity_embedding.vec', names=col_names, header=None, sep='\t'
)
# Total row count across both splits, before any de-duplication.
print(len(entity_emb_train_df) + len(entity_emb_validate_df))

In [None]:
# Stack the train and validate embedding tables under a fresh 0..n-1 index.
entity_emb_df = pd.concat([entity_emb_train_df, entity_emb_validate_df],ignore_index=True)

In [None]:
# Discard the extra x_100 column, leaving the id plus x_0 .. x_99.
# NOTE(review): x_100 is presumably an empty column produced by a trailing
# delimiter in the .vec files — confirm against the raw data.
entity_emb_df = entity_emb_df.drop(columns=['x_100'])
entity_emb_df

In [None]:
# Per-dimension mean of the 100 embedding columns, kept as a plain list.
mean_embeddings = [entity_emb_df[f'x_{i}'].mean() for i in range(100)]

# Same statistic shown as a Series for inspection.
entity_emb_df[[f'x_{i}' for i in range(100)]].mean()

In [None]:
# Serialise each row's embedding as one ';'-joined string with 7 decimals.
# row.iloc[1:] skips the first column (wikiEntityId) and formats the rest.
entity_emb_df['entityEmbedding'] = entity_emb_df.apply(
    lambda row: ";".join('%0.7f' % value for value in row.iloc[1:]),
    axis=1,
)
# The individual x_0 .. x_99 columns are redundant once the string exists.
entity_emb_df = entity_emb_df.drop(columns=[f'x_{i}' for i in range(100)])

In [None]:
# Drop rows whose id AND embedding string are both identical to an earlier
# row, then display the result.
entity_emb_df = entity_emb_df.drop_duplicates()
entity_emb_df

In [None]:
# Sanity check: count ids that still repeat after exact-duplicate removal.
# A non-zero result means the same entity id carries different embedding strings.
entity_emb_df.wikiEntityId.duplicated().sum()

In [None]:
# Write the id / embedding-string table to entity-embedding.csv without the index.
entity_emb_df.to_csv(f'{output_path}/entity-embedding.csv', index = False)

In [None]:
# Display the final embedding frame for a visual check.
entity_emb_df