# In [None]:
#!/usr/bin/env python3
import os
from collections import Counter

import pandas as pd

# The data directory is "<parent of the current working directory>/data".
# os.path.split(...)[0] is the head component, i.e. the parent directory.
dirname = os.path.split(os.getcwd())[0]
data_folder = os.path.join(dirname, 'data')

def get_data_file(folder, file):
    """Return the path of ``file`` inside ``folder`` under ``data_folder``.

    The components are combined with ``os.path.join``, starting from the
    module-level ``data_folder``.
    """
    components = (data_folder, folder, file)
    return os.path.join(*components)

# File-name pattern of one input shard; ``{idx}`` is filled in by ``read``.
template_path = "data_id_{idx}.csv"


def read(idx):
  """Load the word counts stored in the CSV file for shard ``idx``.

  The file name is ``template_path`` with ``idx`` substituted for ``{idx}``.
  Only the ``word``, ``term_frequency`` and ``document_frequency`` columns are
  read; the first line of the file is treated as the header.

  NOTE(review): pandas' default NA parsing is left enabled, so tokens such as
  "NA" or "null" are presumably read as NaN rather than as words -- confirm
  this is intended.

  Args:
    idx: Value substituted into the ``{idx}`` field of ``template_path``.

  Returns:
    A ``(term_counter, document_frequency_counter)`` tuple of ``Counter``
    objects keyed by word. If a word appears on several rows, the last row
    wins.
  """
  data = pd.read_csv(
      # ``{idx}`` is a named field, so the value must be passed by keyword;
      # a positional ``format(idx)`` raises KeyError.
      template_path.format(idx=idx),
      sep=',',
      header=0,
      dtype={
          "word": str,
          "term_frequency": int,
          "document_frequency": int,
      },
      usecols=["word", "term_frequency", "document_frequency"],
  )

  # Pair the columns directly instead of doing one label lookup per row.
  # ``Counter`` must be given a mapping here: a bare iterable of pairs would
  # be tallied as elements instead of being used as word -> count.
  words = data["word"].tolist()
  term_counter = Counter(dict(zip(words, data["term_frequency"].tolist())))
  document_frequency_counter = Counter(
      dict(zip(words, data["document_frequency"].tolist())))
  return term_counter, document_frequency_counter


# Load every shard: one (term counts, document-frequency counts) pair each.
# NOTE(review): ``read`` accepts a single shard index and ``template_path``
# has no year field, so shards are enumerated by index only. An earlier
# ``read(year, i)`` call over the years 2015-2017 could not run (TypeError);
# confirm that the inputs are not meant to be split per year.
parts = [read(i) for i in range(64)]

# Sum the per-shard counters. ``Counter +=`` discards entries whose running
# total is not positive.
term_counter = Counter()
document_frequency_counter = Counter()
for term_freq, doc_freq in parts:
  term_counter += term_freq
  document_frequency_counter += doc_freq

# One row per word that has a term count. A word missing from the
# document-frequency counter gets 0, because ``Counter`` returns 0 for
# absent keys.
df = pd.DataFrame([(word, count, document_frequency_counter[word])
                   for word, count in term_counter.items()],
                  columns=["word", "term_frequency", "document_frequency"])

df = df.sort_values(["word"])
# Remove rows that contain a missing value in any column before indexing.
df.dropna(inplace=True)
df.set_index("word", inplace=True)
df.to_csv("data/word_counts/total_count.csv")