In [1]:
import sys
sys.path.append("..")

import logging 
logging.basicConfig(level=logging.INFO)

import os
import ast
import smart_open

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

from tqdm import tqdm
from datetime import datetime
from collections import OrderedDict

from scripts.src.data_reading import *

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [5]:
def create_df():
    
    root = "rcv1.tar/rcv1/"

    files = []

    for (dirpath, dirnames, filenames) in os.walk(root):
        for name in filenames:
            path = "{}/{}".format(dirpath, name)
            files.append(path)

    item_id = []
    cat_list = []
    date_list = []
    raw_text = []

    for file in tqdm(files[1:-8]):
        tree = ET.parse(file)
        d = {}
        d['text'] = ''

        for child in tree.getiterator():            
            for k, v in child.attrib.items():
                if k not in d:
                    d[k] = [v]
                else:
                    d[k].append(v)
            try:
                if len(child.text.strip()) > 1:
                    d['text'] += child.text
            except:
                pass


        item_id.append(int(d['itemid'][0]))
        date_list.append(d['date'][0])
        cat_list.append(d['code'])
        raw_text.append(d['text'])

    rcv1_text_df = pd.DataFrame(columns = ["doc_id", "text", "date", "categories"])
    rcv1_text_df["doc_id"] = item_id
    rcv1_text_df["text"] = raw_text
    rcv1_text_df["date"] = date_list
    rcv1_text_df["categories"] = cat_list

    save_dest = "rcv1.tar/raw_text_df.tsv"

    rcv1_text_df.to_csv(save_dest, sep='\t', index=False)
    
    return save_dest

# df_file = create_df()

# raw_df = pd.read_csv("rcv1.tar/raw_text_df.tsv", sep="\t", index_col=0) #df_file

In [None]:
# getting list of topic codes
def get_codes(filename):
    topics = []
    with smart_open.smart_open(filename, "r") as t:
        for line in t:
            el = line.strip().split("\t")
            topics.append(el[0])
    
    topic_codes = topics[2:-1]
    return topic_codes

def func(listxlist, major_list):
    res = []
    for ids in listxlist:
        if ids in major_list:
            res.append(ids)
    
    if len(res) == 0:
        res = np.nan
    return res

def generate_cat_hier(filename):
    new = []
    with smart_open.smart_open(filename, "r") as file:
        for line in file:
            el = line.strip().split("parent:")
            parent = el[1].strip().split("child:")
            p = parent[0].strip().split(" ")[0]
            child = parent[1].strip().split(" ")[0]
            if p != "Root" :
                new.append((p, child))
    return new[1:]

def cat_of_n(n):
    sadlist = set()
    ordrd = OrderedDict()

    for p, c in n:
        if p not in sadlist:
            sadlist.add(p)
        if c not in sadlist:
            sadlist.add(c)

    for i, j in enumerate(sadlist):
        ordrd[j] = i

    rev_or = {v:k for k, v in ordrd.items()}
    
    return ordrd, rev_or


In [None]:
def process_raw_df(raw_df):
    
    raw_df["categories"] = raw_df["categories"].apply(lambda x: ast.literal_eval(x))

    n = generate_cat_hier("rcv1.tar/rcv1.topics.hier.orig.txt")

    topic_codes = get_codes("rcv1.tar/rcv1/codes/topic_codes.txt")
    region_codes = get_codes("rcv1.tar/rcv1/codes/region_codes.txt")

    orig, rev = cat_of_n(n)

    cat_file = "rcv1.tar/rcv1_topics_cat_hier.txt"
    file = open(cat_file, "w+")
    for p, c in n:
        string = "{} {}\n".format(orig[p], orig[c])
        file.write(string)
    file.close()

    raw_df["topic_categories"] = ''
    raw_df["region_categories"] = ''
    raw_df["split"] = ''

    raw_df["topic_categories"] = raw_df["categories"].apply(lambda x: func(x, topic_codes))
    raw_df["region_categories"] = raw_df["categories"].apply(lambda x: func(x, region_codes))
    raw_df["date"] = pd.to_datetime(raw_df["date"])

    train_in = pd.Timestamp(year=1996, month=8, day=20)
    train_out = pd.Timestamp(year=1996, month=8, day=31)

    for ix in tqdm(raw_df.index):
        if raw_df.at[ix, "date"] >= train_in and raw_df.at[ix, "date"] <= train_out:
            raw_df.at[ix, "split"] = 'training'
        else:
            raw_df.at[ix, "split"] = 'testing'

    raw_df = raw_df.dropna()

    raw_df = raw_df.drop(columns="Unnamed: 0")

    # convert topic list to topic id list
    raw_df["topic_ids"] = raw_df["topic_categories"].apply(lambda x: [orig[item] for item in x])

    raw_df.to_csv("rcv1.tar/rcv1_ckpt.tsv", sep='\t', index=False)
    
    return raw_df_df

In [3]:
raw_df = pd.read_csv("../rcv1.tar/rcv1_ckpt.tsv", sep='\t', index_col=0)

In [19]:
raw_df.head(4)

Unnamed: 0,doc_id,text,date,categories,topic_categories,region_categories,split,topic_ids
0,2286,MEXICO: Recovery excitement brings Mexican mar...,1996-08-20,"['MEX', 'E11', 'ECAT', 'M11', 'M12', 'MCAT']","['E11', 'ECAT', 'M11', 'M12', 'MCAT']",['MEX'],training,"[101, 32, 1, 18, 37]"
1,2287,USA: Chrysler plans new investments in Latin A...,1996-08-20,"['ARG', 'BRAZ', 'USA', 'I24700', 'I34320', 'I3...","['C24', 'CCAT']","['ARG', 'BRAZ', 'USA']",training,"[82, 33]"
2,2288,"USA: CompuServe reports loss, cutting work for...",1996-08-20,"['USA', 'I83940', 'C15', 'C151', 'CCAT', 'E41'...","['C15', 'C151', 'CCAT', 'E41', 'ECAT', 'GCAT',...",['USA'],training,"[41, 23, 33, 28, 32, 15, 5]"
3,2289,"USA: CompuServe reports loss, cutting work for...",1996-08-20,"['USA', 'I83940', 'C15', 'C151', 'CCAT']","['C15', 'C151', 'CCAT']",['USA'],training,"[41, 23, 33]"


In [9]:
class FTextIter(object):
    def __init__(self, file_path):
        super(FTextIter, self).__init__()
        self.file_path = file_path

    def __iter__(self):
        with smart_open.smart_open(self.file_path, 'r') as fin:
            for line in fin:
                line = preprocess_string(line)
                yield list(line)

In [10]:
def get_text(prefix, df):

    fname = "{}/all_rcv1_raw.txt".format(prefix)
    fe, ex = os.path.splitext(fname)
    
    if not os.path.isfile(fname):
        file = open(fname, "w+", newline='\n')

        for i in tqdm(df.index):
            str_each_line = df.at[i, "text"]
            file.write(str_each_line)

        file.close()

    return fe

In [11]:
def fasttext_generator(fname):
    '''
    prefix: path to save the file
    '''
    fe, ex = os.path.splitext(fname)
    filename = "{}.model".format(fe)

    if not os.path.isfile(filename):
        moo = FastText(size=300, window=3, min_count=1) # hs=0, negative=0, size=300
        moo.build_vocab(sentences=FTextIter(fname))
        total_examples = moo.corpus_count
        moo.train(sentences=FTextIter(fname), total_examples=total_examples, epochs=5)
        moo.save(filename)
    else:
        moo = FastText.load(filename)

    return moo

In [20]:
def gen_doc2vec(df, prefix):
    '''
    prefix: os file path to pickle file/saved models
    '''
    save_dest = "{}/rcv1_fasttextV1.tsv".format(prefix)

    if not os.path.isfile(save_dest):
        logging.info("Generating document vectors...")
        fname = get_text(prefix, df)
        feex = fname + ".txt"
        model = fasttext_generator(feex)
        
        temp_col = []
        for index in tqdm(df.index):
            vec = []
            line = preprocess_string(raw_df.at[index, "text"])
            for word in line:
                vec.append(model.wv[word])
            temp_col.append(np.mean(vec, axis=0))

        df["vec"] = temp_col
        
        result = df.sort_index(0)
        
        result.to_csv(save_dest, sep='\t', index=False)

    else:
        logging.info("Loading exisiting model... ")
        result = pd.read_csv(save_dest, sep='\t', index_col=0)

    return result

In [21]:
res = gen_doc2vec(raw_df, "rcv1.tar") #run this only once

INFO:root:Loading exisiting model... 


In [22]:
grouped = res.groupby("split")
tr = grouped.get_group("training")
te = grouped.get_group("testing")

In [23]:
tr = tr.drop(columns=["categories", "region_categories"])

In [24]:
te = te.drop(columns=["categories", "region_categories"])

In [27]:
tr.shape, te.shape

((23149, 7), (781265, 7))

In [28]:
tr.to_csv("rcv1.tar/rcv1_fasttext_train.tsv", sep='\t', index=False)
te.to_csv("rcv1.tar/rcv1_fasttext_test.tsv", sep='\t', index=False)

In [4]:
tr = pd.read_csv("../rcv1.tar/rcv1_fasttext_train.tsv", sep="\t", index_col=0)

In [5]:
tr

Unnamed: 0_level_0,text,date,topic_categories,split,topic_ids,vec
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2286,MEXICO: Recovery excitement brings Mexican mar...,1996-08-20,"['E11', 'ECAT', 'M11', 'M12', 'MCAT']",training,"[101, 32, 1, 18, 37]",[ 3.52063268e-01 -3.36165130e-01 1.46681756e-...
2287,USA: Chrysler plans new investments in Latin A...,1996-08-20,"['C24', 'CCAT']",training,"[82, 33]",[ 0.54422605 0.23416612 -0.23376958 -0.794583...
2288,"USA: CompuServe reports loss, cutting work for...",1996-08-20,"['C15', 'C151', 'CCAT', 'E41', 'ECAT', 'GCAT',...",training,"[41, 23, 33, 28, 32, 15, 5]",[ 0.49522075 -0.21427244 -0.18388595 -1.068658...
2289,"USA: CompuServe reports loss, cutting work for...",1996-08-20,"['C15', 'C151', 'CCAT']",training,"[41, 23, 33]",[ 0.49522075 -0.21427244 -0.18388595 -1.068658...
2290,USA: Planet Hollywood launches credit card.Pla...,1996-08-20,"['C11', 'C22', 'CCAT']",training,"[67, 45, 33]",[ 0.7525597 0.17336224 -0.40084925 -0.723585...
2291,"USA: Hog prices tumble as supplies increase, c...",1996-08-20,"['M14', 'MCAT']",training,"[87, 37]",[ 0.18626471 -0.01947591 -0.24743521 -0.805329...
2292,USA: Blue chips end up as Fed keeps interest r...,1996-08-20,"['M11', 'M12', 'M13', 'M132', 'M14', 'MCAT']",training,"[1, 18, 29, 84, 87, 37]",[ 1.48425579e-01 -2.19930500e-01 1.36874944e-...
2293,USA: Sprint to offer consumer Internet access ...,1996-08-20,"['C22', 'CCAT']",training,"[45, 33]",[ 9.66169000e-01 -1.08263724e-01 -6.61776140e-...
2294,USA: Back-to-school spending is up.Back-to-sch...,1996-08-20,"['E14', 'ECAT']",training,"[102, 32]",[ 0.34278277 -0.07838031 -0.43072838 -0.694874...
2295,"USA: Kansas, Arizona add to suits against toba...",1996-08-20,"['C12', 'CCAT', 'GCAT', 'GCRIM']",training,"[25, 33, 15, 85]",[ 3.68959397e-01 -1.94391478e-02 -4.86837655e-...
