In [1]:
import os
import json
import nltk
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from math import log
from collections import Counter
# Notebook-wide stemming callable: the bound `stem` method of one Porter stemmer instance.
stem = nltk.stem.PorterStemmer().stem

In [3]:
# Raw .jsonl dump, located under the root directory given by $PATH_AKE_DATASETS.
dataset_path = os.sep.join([
    os.environ['PATH_AKE_DATASETS'],
    'datasets/NYTime/src/www.nytimes.com.2019-06-03.jsonl',
])

### Preprocess data (do not execute; load the cached file instead)

In [4]:
# Parse the raw JSON-lines dump into a list of per-article dicts.
# Records are indexed positionally: [0] is split on '/' like a URL path,
# [1] is stored as the title, [3] as the abstract, [4] is iterated as keyword strings.
data = []
# Explicit encoding so decoding does not depend on the platform's default locale.
with open(dataset_path, encoding='utf-8') as f:
    for raw_line in tqdm(f):
        if not raw_line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        record = json.loads(raw_line)
        path_parts = record[0].replace('.html', '').split('/')
        # Segments 1-3 form the date, the last one is the id, everything in between
        # is kept as the list of categories.
        date, cats, id_ = '/'.join(path_parts[1:4]), path_parts[4:-1], path_parts[-1]
        # Each keyword string is split on ';' into a list of variants; empty strings are dropped.
        keywords = [kw.split(';') for kw in record[4] if kw]
        data.append({
            'id': id_, 'categories': cats, 'date': date,
            'title': record[1], 'abstract': record[3],
            'keywords': keywords
        })

FileNotFoundError: [Errno 2] No such file or directory: '/home/gallina/ake-datasets/datasets/NYTime/src/www.nytimes.com.2019-06-03.jsonl'

In [4]:
# Attach a tokenised version of the title and of the abstract to every document.
for doc in tqdm(data):
    for field in ('title', 'abstract'):
        doc[field + '_tok'] = nltk.word_tokenize(doc[field])

 61%|██████    | 55347/91013 [04:31<02:53, 206.07it/s]

KeyboardInterrupt: 

In [None]:
def sjl(x):
    """Stem every token of `x`, join the stems with single spaces and lowercase the result."""
    return ' '.join(map(stem, x)).lower()


def tsjl(x):
    """Tokenise the string `x` with nltk, then normalise the tokens with `sjl`."""
    return sjl(nltk.word_tokenize(x))


# For every document, keep only the keyword variants that occur in its title + abstract
# (after stemming); keyword groups left without any present variant are dropped.
for d in tqdm(data):
    # Pad with spaces so that a variant only matches on whole stemmed tokens:
    # a bare substring test would let e.g. "us" match inside "busi".
    content = ' ' + sjl(d['title_tok'] + d['abstract_tok']) + ' '
    present = []
    for kw in d['keywords']:
        found = []
        for v in kw:
            needle = tsjl(v)
            # An empty variant (e.g. produced by a trailing ';') would be a substring
            # of any text, so it must never count as present.
            if needle and ' ' + needle + ' ' in content:
                found.append(v)
        if found:
            present.append(found)
    d['pr_keywords'] = present

In [None]:
# Per-document sizes: token count of title + abstract, number of keyword groups,
# and number of keyword groups that have at least one present variant.
for doc in data:
    doc.update({
        'len': len(doc['title_tok']) + len(doc['abstract_tok']),
        'len_kw': len(doc['keywords']),
        'len_pr_kw': len(doc['pr_keywords']),
    })

In [None]:
# Cache the preprocessed documents so the slow cells above need not be re-run.
with open('dataaaa.json', 'w') as cache_file:
    json.dump(data, cache_file)

### Load data

In [None]:
# Reload the preprocessed documents from the JSON cache.
with open('dataaaa.json') as cache_file:
    data = json.loads(cache_file.read())

In [None]:
# One row per document.
df = pd.DataFrame(data)
# First category of each document; None when the category list is empty, so that a
# document without categories does not abort the cell with an IndexError.
df['macro_categories'] = df['categories'].map(lambda cats: cats[0] if cats else None)
# Share of keyword groups that have at least one variant present in the text.
df['ratio_present'] = df['len_pr_kw'] / df['len_kw']

In [None]:
# Canonical form of the keyword lists: variants sorted inside each group, then groups sorted.
for col in ('keywords', 'pr_keywords'):
    df[col + '_sorted'] = df[col].map(lambda groups: sorted(map(sorted, groups)))

In [None]:
# Flat string versions of the list-valued columns (title_str / abstract_str are the
# keys used by the duplicate search further down).
df['title_str'] = df['title_tok'].map(' '.join)
df['keywords_str'] = df['keywords'].map(str)
# df['pr_keywords_str'] = df['pr_keywords'].map(str)
#df['norm_id'] = df['id'].map(lambda x: re.sub(r'\d+', '#', x)).map(str.lower)
# Map the bound method itself: calling ' '.join() with no argument raises TypeError.
# The token list is joined, mirroring how title_str is built from title_tok.
df['abstract_str'] = df['abstract_tok'].map(' '.join)

In [None]:
# Number of documents for each value of 'len', drawn with a log-scaled x axis.
docs_per_len = df[['id', 'len']].groupby('len').count()
docs_per_len.plot(logx=True)
plt.show()

In [None]:
# Same distribution, restricted to documents with 'len' below 1300 (linear axes).
short_docs = df[df['len'] < 1300]
short_docs[['id', 'len']].groupby('len').count().plot()
plt.show()

In [None]:
# (bound, percentage of documents with 'len' strictly below bound), bounds 0..3000 step 200.
tmp = []
for bound in range(0, 3001, 200):
    tmp.append((bound, len(df[df['len'] < bound]) / len(df) * 100))

In [None]:
# For each bound: (bound, cumulative %, extra % gained by moving up to the next bound).
[(lower[0], round(lower[1], 2), round(upper[1] - lower[1], 2)) for lower, upper in zip(tmp, tmp[1:])]

In [None]:
# Number of documents with 'len' above 3000 in each macro category.
df.loc[df['len'] > 3000].groupby('macro_categories')['id'].count()

In [None]:
# Documents of the 'us' macro category with 'len' above 3000.
# Both conditions are combined into a single mask: chaining two boolean selections
# applies a mask built on the full frame to an already-filtered frame, which pandas
# has to re-align (with a warning).
df.loc[(df['macro_categories'] == 'us') & (df['len'] > 3000), ['id', 'len', 'keywords_sorted']]

In [None]:
def groupby(iterable, key=lambda x: x):
    """Bucket the elements of `iterable` by `key(element)`.

    Returns a plain dict mapping each key to the list of elements that
    produced it. Elements keep their input order inside a bucket and keys
    appear in first-seen order.
    """
    buckets = {}
    for item in iterable:
        buckets.setdefault(key(item), []).append(item)
    return buckets

In [None]:
# Accumulates the index labels of the `df` rows flagged for removal by the filtering cells below.
to_remove_id = set()

## Filter recurring similar articles

In [None]:
# Group documents by id (ids are not unique). Documents sharing an id have similar
# content; if they also mostly share the same keyword set they are near-duplicates.
# Groups of at least 3 documents are kept only when at least 30% of their documents
# have a distinct keyword set; otherwise every document of the group is flagged.
for v, subdf in df.groupby('id'):
    if subdf.shape[0] < 3:
        continue
    nb = subdf['id'].count()
    nb_unique = subdf['keywords_sorted'].map(str).unique().size
    if nb_unique / nb < 0.3:
        # The group already carries the labels of its rows: no need to rescan `df`.
        to_remove_id |= set(subdf.index)

In [None]:
# Stock information: flag every document whose id contains 'daily-stock'.
filter_ = df['id'].map(lambda doc_id: 'daily-stock' in doc_id)
stock_docs = df[filter_]
to_remove_id |= set(stock_docs.index)
# Displayed: (number of flagged documents, number of distinct keyword sets among them).
stock_docs['id'].count(), stock_docs['keywords_sorted'].map(str).unique().size

In [None]:
# Lottery information: flag 'nyregion' documents whose id contains 'lottery-numbers'.
is_lottery = df['id'].map(lambda doc_id: 'lottery-numbers' in doc_id)
filter_ = is_lottery & (df['macro_categories'] == 'nyregion')
lottery_docs = df[filter_]
to_remove_id |= set(lottery_docs.index)
# Displayed: (number of flagged documents, number of distinct keyword sets among them).
lottery_docs['id'].count(), lottery_docs['keywords_sorted'].map(str).unique().size

## Find duplicates

Two entries are considered duplicates when both their title and their content (abstract) are equal.

In [None]:
# Group rows sharing the same (title, abstract) strings; every group with more than
# one row is recorded as a set of index labels.
tmp = df.groupby(['title_str', 'abstract_str'])[['id', 'macro_categories']]
duplicates = [set(group.index) for _, group in tmp if group.shape[0] > 1]

In [None]:
# Remove duplicates
for d in duplicates:
    if d & to_remove_id:
        # At least one member of the group is already flagged for removal; since the
        # members are duplicates of one another, the whole group is removed.
        to_remove_id |= d
    else:
        # Keep exactly one copy. Sorting makes the choice deterministic (the smallest
        # index label survives) instead of depending on set iteration order.
        to_remove_id |= set(sorted(d)[1:])

In [None]:
# Number of surplus rows: for every duplicate group, all members but one.
sum(len(group) - 1 for group in duplicates)

## Filter on length

In [None]:
# Use the interquartile range (IQR) to define an upper bound on document length.
#  (the comment in the original notes pandas uses 1.5 * IQR; here the factor is 10,
#   so only the most extreme documents are removed)
#  |--[  ]----------|
#     Q1 Q3         max_len
# max_len = Q3 + 10 * IQR
q1, q3 = df['len'].quantile(0.25), df['len'].quantile(0.75)
max_len = q3 + 10 * (q3 - q1)
filter_ = df['len'] > max_len
to_remove_id |= set(df[filter_].index)

In [None]:
# Distribution of 'len', one box per macro category.
df.boxplot(column='len', by='macro_categories')
plt.show()

In [None]:
# Per-category mean of the numeric columns. numeric_only=True is required because the
# frame also holds string/list columns, on which recent pandas versions refuse to
# compute a mean instead of silently dropping them.
df.groupby('macro_categories').mean(numeric_only=True)

In [None]:
# One minus the mean of 'ratio_present' (len_pr_kw / len_kw): the average share of
# keyword groups with no variant found in the text.
1 - df['ratio_present'].mean()

## Filtering !

In [None]:
# Number of rows currently flagged for removal.
len(to_remove_id)

In [None]:
# NOTE(review): indexing an Index with a list of ints selects by position, not by label;
# this matches the labels stored in to_remove_id only while df still has its default
# 0..n-1 index — confirm.
df.index[list(to_remove_id)]

In [None]:
# Drop every flagged row. errors='ignore' keeps the cell idempotent: on a second run
# the labels are already gone, and a plain drop would raise KeyError.
df = df.drop(index=list(to_remove_id), errors='ignore')

In [None]:
import random

In [None]:
# Shuffle the remaining row labels with a fixed seed, so that the train/valid/test
# split below is the same after every kernel restart.
SPLIT_SEED = 42
indexes = list(df.index)
random.Random(SPLIT_SEED).shuffle(indexes)

In [None]:
# Split the shuffled labels: first 10000 -> test, next 10000 -> valid, the rest -> train.
n_test, n_valid = 10000, 10000
test_id = indexes[:n_test]
valid_id = indexes[n_test:n_test + n_valid]
train_id = indexes[n_test + n_valid:]
test = df.loc[test_id]
valid = df.loc[valid_id]
train = df.loc[train_id]

In [None]:
# Percentage of test documents in each macro category.
test_counts = test.groupby('macro_categories')['abstract'].count()
test_counts / test.shape[0] * 100

In [None]:
# Percentage of validation documents in each macro category.
valid_counts = valid.groupby('macro_categories')['abstract'].count()
valid_counts / valid.shape[0] * 100

In [None]:
# Percentage of training documents in each macro category.
train_counts = train.groupby('macro_categories')['abstract'].count()
train_counts / train.shape[0] * 100

In [None]:
# Columns of the test split that come straight from the raw records.
raw_columns = ['id', 'categories', 'date', 'title', 'abstract', 'keywords']
test[raw_columns]