<a href="https://colab.research.google.com/github/towashington/BERTopic/blob/master/Factiva-Taiwan/functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Python support for HTTPS requests
import requests

## Support for fetching API key from environment variable
import os

## Used for periodic status checks
import time

## Used for dataframe processing
import pandas as pd
from pandas import Series

## Used for handling avro format
import fastavro

## Used for calculations
import numpy as np

## Used for visualizations
from IPython.display import display
from IPython.display import HTML
from IPython.display import JSON
import matplotlib.pyplot as plt
pd.set_option('display.notebook_repr_html',True)

##Used for interactions
import ipywidgets as widgets
from ipywidgets import widgets
from ipywidgets import Layout
##from IPython.html import widgets

##Used for working with objects related to different file formats
import csv
import json

##Used for Command line option and argument parsing
import argparse

## Used for binary file download
import shutil

##Used for streaming
import io

## May is going to fes

##Used for probing or changing the configuration of the interpreter resources for interacting.
import sys

import datetime
from tqdm import tqdm

In [None]:
class Factiva_Connector:
  """Client for the Dow Jones "extractions/documents" REST endpoints.

  Workflow supported by this class:
    1. ``count_articles``    - submit an "explain" job and store the counts.
    2. ``download_articles`` - submit a snapshot job and download its files
                               into ``raw_folder``.
    3. ``preproc_articles``  - read the downloaded Avro files and re-write
                               them as JSON-lines files partitioned by source
                               code, publication year/month and part number
                               under ``preproc_folder``.

  Args:
      api_key: Value sent in the ``user-key`` request header.
      where_part: Value placed under ``query.where`` in the request body.
      format_part: Value placed under ``query.format`` in the request body.
  """

  # State string that both job types report once they have finished.
  _JOB_DONE = "JOB_STATE_DONE"
  # NOTE(review): failure-state name is assumed from the Dow Jones API
  # documentation - confirm. A state listed here aborts polling instead of
  # looping forever on a job that will never reach _JOB_DONE.
  _JOB_FAILED_STATES = ("JOB_STATE_FAILED",)
  # Bytes held in memory at once while streaming a snapshot file to disk.
  _DOWNLOAD_CHUNK_BYTES = 1024 * 1024

  def __init__(self, api_key, where_part, format_part):
    self.api_key = api_key
    self.where_part = where_part
    self.format_part = format_part

    self.explain_url = "https://api.dowjones.com/alpha/extractions/documents/_explain"
    self.snapshot_create_url = "https://api.dowjones.com/alpha/extractions/documents/"
    self.account_url = "https://api.dowjones.com/alpha/accounts/{0}".format(api_key)
    self.headers = {
        'user-key': api_key,
        'content-type': "application/json",
        'cache-control': "no-cache",
        'content-encoding': "UTF-8",
    }
    self.DNA_Query = {"query": {"where": where_part, "format": format_part}}
    self.raw_folder = './raw/'
    self.preproc_folder = './partitioned/'

  def _poll_job(self, status_url, poll_seconds):
    """Poll ``status_url`` until the job is done and return its JSON payload.

    Args:
        status_url: URL returned under ``links.self`` when the job was created.
        poll_seconds: Seconds to wait between two status requests.

    Returns:
        The decoded JSON body of the status response that reported
        ``JOB_STATE_DONE``.

    Raises:
        requests.HTTPError: If a status request returns an HTTP error code.
        RuntimeError: If the job reports one of ``_JOB_FAILED_STATES``.
    """
    while True:
        response = requests.request("GET", status_url, headers=self.headers)
        response.raise_for_status()
        payload = response.json()
        state = payload["data"]["attributes"]["current_state"]
        if state == self._JOB_DONE:
            return payload
        if state in self._JOB_FAILED_STATES:
            raise RuntimeError("Job at {0} ended in state {1}".format(status_url, state))
        # Only wait when another check is actually needed.
        time.sleep(poll_seconds)

  def count_articles(self):
    """Run an "explain" job for the configured query and store its counts.

    Side effects:
        Sets the pandas ``display.float_format`` option so floats are shown
        without decimals, and stores the result in ``self.count``.

    Returns:
        The ``counts`` attribute of the finished explain job (also available
        afterwards as ``self.count``).

    Raises:
        requests.HTTPError: If the API answers with an HTTP error code.
        RuntimeError: If the explain job reports a failed state.
    """
    pd.set_option('display.float_format', lambda x: '%.0f' % x)

    explain_response = requests.request(
        "POST", self.explain_url, json=self.DNA_Query, headers=self.headers)
    explain_response.raise_for_status()
    status_url = explain_response.json()["links"]["self"]

    payload = self._poll_job(status_url, poll_seconds=10)
    self.count = payload["data"]["attributes"]["counts"]
    return self.count

  def _download_file(self, file_url):
    """Stream one snapshot file into ``raw_folder``.

    The local name is built from the last two path segments of ``file_url``
    joined with an underscore.
    """
    url_parts = file_url.split('/')
    filename = url_parts[-2] + "_" + url_parts[-1]

    # The context manager releases the connection even if writing fails.
    with requests.get(file_url, headers=self.headers,
                      allow_redirects=True, stream=True) as file_response:
        file_response.raise_for_status()
        # The size header is informational only and may be absent.
        filesize = file_response.headers.get('Content-Length', 'unknown')
        print('Downloading file of size ' + filesize)
        with open(self.raw_folder + filename, 'wb') as local_file:
            for chunk in file_response.iter_content(chunk_size=self._DOWNLOAD_CHUNK_BYTES):
                local_file.write(chunk)

  def download_articles(self):
    """Create a snapshot for the configured query and download its files.

    Displays the snapshot job URL in a read-only text widget, polls the job
    once a minute until it is done, then streams every file listed in the job
    result into ``raw_folder`` (created if missing).

    Raises:
        requests.HTTPError: If the API or a file download answers with an
            HTTP error code.
        RuntimeError: If the snapshot job reports a failed state.
    """
    create_response = requests.request(
        "POST", self.snapshot_create_url, json=self.DNA_Query, headers=self.headers)
    create_response.raise_for_status()
    job_url = create_response.json()["links"]["self"]

    # Show the job URL so it can be copied out of the notebook.
    display(widgets.Textarea(
        value=job_url,
        description='Snapshot Create Job URL',
        style={'description_width': 'initial'},
        layout=Layout(width='98%'),
        disabled=True))

    started = time.monotonic()
    payload = self._poll_job(job_url, poll_seconds=60)
    elapsed_minutes = int(round((time.monotonic() - started) / 60))

    print('\n\nSnapshot Files ready to download. Creation took approximately '
          + str(elapsed_minutes) + ' minute(s).')

    os.makedirs(self.raw_folder, exist_ok=True)
    for file_info in payload["data"]["attributes"]["files"]:
        self._download_file(file_info['uri'])

  @staticmethod
  def _year_month(epoch_ms):
    """Return the ``YYYYMM`` string for a millisecond epoch timestamp.

    The timestamp is interpreted in UTC so the resulting partition does not
    depend on the local timezone of the machine running the code.
    """
    moment = datetime.datetime.fromtimestamp(
        int(epoch_ms) / 1000, tz=datetime.timezone.utc)
    return moment.strftime("%Y%m")

  def preproc_articles(self):
    """Partition the downloaded Avro files into JSON-lines files.

    For every ``*.avro`` file in ``raw_folder`` the records are grouped by
    ``source_code`` and publication year/month and written to
    ``<preproc_folder>pub_name=<source>/pub_date=<YYYYMM>/part=<n>/<part>.json``
    where ``<part>`` is the last 12 characters of the file name (without the
    ``.avro`` suffix) and ``<n>`` is that value parsed as an integer.
    Each written sub-folder is printed.

    Files without an ``.avro`` suffix and Avro files without records are
    skipped.
    """
    suffix = '.avro'
    avro_files = [name for name in os.listdir(self.raw_folder) if name.endswith(suffix)]

    for file in tqdm(avro_files):
        with open(self.raw_folder + file, 'rb') as fp:
            df = pd.DataFrame.from_records(list(fastavro.reader(fp)))

        if df.empty:
            # Nothing to partition; also avoids a KeyError on the missing column.
            continue

        df['year_month'] = df['publication_date'].apply(self._year_month)

        part_name = file[:-len(suffix)][-12:]
        part_number = 'part=' + str(int(part_name))

        for (source_code, year_month), group in df.groupby(['source_code', 'year_month']):
            subfolder = (self.preproc_folder
                         + 'pub_name=' + str(source_code) + '/'
                         + 'pub_date=' + str(year_month) + '/'
                         + part_number + '/')
            os.makedirs(subfolder, exist_ok=True)
            destination = subfolder + part_name + '.json'

            # The helper column is only needed for grouping, not in the output.
            out = group.drop(columns=['year_month']).to_json(orient='records', lines=True)
            with open(destination, 'w') as f:
                f.write(out)

            print(subfolder)

42

In [None]:
class AWS_Connector:
  """Manage a temporary AWS shared-credentials file for this process.

  ``initiate_connection`` writes the given credentials to
  ``./config/awscli.ini`` and points the ``AWS_SHARED_CREDENTIALS_FILE``
  environment variable at it; ``close_connection`` removes the folder again.

  Args:
      aws_access_key_id: Value written as ``aws_access_key_id``.
      aws_secret_access_key: Value written as ``aws_secret_access_key``.
      aws_region: Value written as ``region``.
  """

  def __init__(self, aws_access_key_id, aws_secret_access_key, aws_region):
    self.aws_access_key_id = aws_access_key_id
    self.aws_secret_access_key = aws_secret_access_key
    self.aws_region = aws_region
    self.folder = './config/'
    self.path = "./config/awscli.ini"

  def initiate_connection(self):
    """Write the credentials file and expose it through the environment.

    Creates ``self.folder`` if needed, writes a ``[default]`` profile to
    ``self.path`` and sets ``AWS_SHARED_CREDENTIALS_FILE`` to that path.
    """
    text = "\n".join([
        "[default]",
        "aws_access_key_id = {0}".format(self.aws_access_key_id),
        "aws_secret_access_key = {0}".format(self.aws_secret_access_key),
        "region = {0}".format(self.aws_region),
        "",
    ])

    os.makedirs(self.folder, exist_ok=True)

    with open(self.path, 'w') as f:
        f.write(text)
    # The file holds a secret key: restrict it to the current user.
    os.chmod(self.path, 0o600)

    # Set through os.environ so this process and the child processes it
    # starts see the variable; a shell "export" would only affect a
    # short-lived subshell.
    os.environ['AWS_SHARED_CREDENTIALS_FILE'] = self.path

  def close_connection(self):
    """Delete the credentials folder and stop advertising the file.

    Safe to call when the folder has already been removed. The environment
    variable is cleared only if it still points at this connector's file.
    """
    try:
        shutil.rmtree(self.folder)
    except FileNotFoundError:
        pass

    if os.environ.get('AWS_SHARED_CREDENTIALS_FILE') == self.path:
        del os.environ['AWS_SHARED_CREDENTIALS_FILE']