<a href="https://colab.research.google.com/github/zartuyt/DataExtractor/blob/main/DataExtractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Fetch the DataExtractor repository into the current working directory.
# The following cells change into it and open the input files by relative
# path (presumably the repository ships them - confirm).
!git clone https://github.com/zartuyt/DataExtractor

Cloning into 'DataExtractor'...
remote: Enumerating objects: 7, done.
remote: Counting objects: 100% (7/7), done.
remote: Compressing objects: 100% (7/7), done.
remote: Total 7 (delta 1), reused 0 (delta 0), pack-reused 0
Receiving objects: 100% (7/7), 9.56 KiB | 9.56 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [6]:
import os

# Enter the cloned repository so the relative input paths used later resolve.
# The guard keeps this cell re-runnable: once the kernel is already inside the
# repository, a second os.chdir('DataExtractor') would raise FileNotFoundError
# (or descend into a nested clone if the clone cell was also re-run).
if os.path.basename(os.getcwd()) != 'DataExtractor':
    os.chdir('DataExtractor')

In [7]:
# Input files, given relative to the current working directory.
# Pickled invoice records; DataExtractor reads this file with pickle.load.
invoices_new_path = 'invoices_new.pkl'
# Text file that DataExtractor parses as comma-separated integer invoice ids.
expired_invoices_path = 'expired_invoices.txt'

In [14]:
import pandas as pd
import pickle

class DataExtractor:
    """Flatten pickled invoices into one row per invoice item and export them as CSV.

    Each invoice is accessed as a mapping with the keys ``'id'``,
    ``'created_on'`` and (optionally) ``'items'``. Every entry of ``'items'``
    is accessed as ``{'item': {'id', 'name', 'type', 'unit_price'}, 'quantity'}``.

    Args:
        invoices_new_path: Path of the pickle file holding the invoices.
        expired_invoices_path: Path of a text file with comma-separated
            integer ids of expired invoices.
    """

    # Column order of the flattened table. Also used to build a well-formed
    # empty frame when no invoice produces any row.
    _COLUMNS = [
        'invoice_id',
        'created_on',
        'invoiceitem_id',
        'invoiceitem_name',
        'type',
        'unit_price',
        'total_price',
        'percentage_in_invoice',
        'is_expired',
    ]

    def __init__(self, invoices_new_path, expired_invoices_path):
        self.invoices_new_path = invoices_new_path
        self.expired_invoices_path = expired_invoices_path
        # Numeric item type code -> human readable label.
        self.type = {0: 'Material', 1: 'Equipment', 2: 'Service', 3: 'Other'}

    def load_new_invoices(self):
        """Load the pickled invoices into ``self.data``."""
        # SECURITY: pickle.load can execute arbitrary code while deserialising.
        # Only point invoices_new_path at files from a trusted source.
        with open(self.invoices_new_path, 'rb') as file:
            self.data = pickle.load(file)

    def load_expired_invoices(self):
        """Read the comma-separated expired invoice ids into ``self.expired_invoices``.

        Blank entries (empty file, trailing comma, trailing newline) are
        ignored.

        Raises:
            ValueError: If a non-blank entry is not an integer.
        """
        with open(self.expired_invoices_path, 'r') as file:
            content = file.read()
        tokens = (token.strip() for token in content.split(','))
        # Skipping blanks avoids int('') blowing up on "1,2,3," or an empty file.
        self.expired_invoices = {int(token) for token in tokens if token}

    def convert_to_int(self, value):
        """Return ``int(value)``, or ``None`` when ``value`` cannot be converted."""
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            return None

    def transform_data(self):
        """Flatten ``self.data`` into a DataFrame with one row per invoice item.

        Invoices are skipped when their id contains no digits, when
        ``created_on`` cannot be parsed as a date, or when they have no items.
        ``percentage_in_invoice`` is the item's share of the invoice total and
        is ``0.0`` when that total is zero.

        Requires ``self.data`` and ``self.expired_invoices`` to be set, as
        done by ``load_new_invoices`` and ``load_expired_invoices``.

        Returns:
            pandas.DataFrame with the columns listed in ``_COLUMNS``, sorted by
            ``invoice_id`` and ``invoiceitem_id``. The frame is empty (but
            still has all columns) when no invoice yields a row.
        """
        flattened_data = []

        for invoice in self.data:
            invoice_id = self._parse_invoice_id(invoice['id'])
            if invoice_id is None:
                continue

            # Handle invalid dates
            try:
                created_on = pd.to_datetime(invoice['created_on'])
            except (ValueError, TypeError, pd.errors.OutOfBoundsDatetime):
                continue

            # Skip invoices whose 'items' key is missing, None or empty.
            if 'items' not in invoice or not invoice['items']:
                continue

            # Parse price and quantity once per item; the line totals are
            # needed both for the invoice total and for each output row.
            parsed_items = []
            for item in invoice['items']:
                details = item['item']
                unit_price = self._parse_price(details['unit_price'])
                quantity = self._parse_quantity(item['quantity'])
                parsed_items.append((details, unit_price, unit_price * quantity))

            invoice_total = sum(line_total for _, _, line_total in parsed_items)
            is_expired = invoice_id in self.expired_invoices

            for details, unit_price, total_price in parsed_items:
                # A zero total (all prices or quantities unparseable or 0)
                # would otherwise raise ZeroDivisionError.
                if invoice_total:
                    percentage_in_invoice = total_price / invoice_total
                else:
                    percentage_in_invoice = 0.0

                flattened_data.append({
                    'invoice_id': invoice_id,
                    'created_on': created_on,
                    'invoiceitem_id': details['id'],
                    'invoiceitem_name': details['name'],
                    'type': self._parse_type(details['type']),
                    'unit_price': unit_price,
                    'total_price': total_price,
                    'percentage_in_invoice': percentage_in_invoice,
                    'is_expired': is_expired
                })

        # Passing the columns explicitly keeps sort_values from raising
        # KeyError when there are no rows at all.
        flat_df = pd.DataFrame(flattened_data, columns=self._COLUMNS)
        flat_df = flat_df.sort_values(by=['invoice_id', 'invoiceitem_id'])
        return flat_df

    def _parse_invoice_id(self, raw_id):
        """Return the digits of ``raw_id`` as an int, or ``None`` if it has none."""
        digits = ''.join(filter(str.isdigit, str(raw_id)))
        try:
            return int(digits)
        except ValueError:
            return None

    def _parse_price(self, price):
        """Return ``price`` as an int, or 0 when it cannot be converted."""
        try:
            return int(price)
        except (ValueError, TypeError, OverflowError):
            return 0

    def _parse_quantity(self, quantity):
        """Return ``quantity`` as an int, or 0 when it cannot be converted."""
        try:
            return int(quantity)
        except (ValueError, TypeError, OverflowError):
            return 0

    def _parse_type(self, type_):
        """Map a numeric type code to its label; unknown or invalid codes give 'Other'."""
        try:
            return self.type[int(type_)]
        except (ValueError, TypeError, OverflowError, KeyError):
            return 'Other'

    def run(self, output_path='transformed_invoices.csv'):
        """Load both inputs, flatten the invoices and write them to ``output_path``.

        Args:
            output_path: Destination of the CSV file. Defaults to
                ``'transformed_invoices.csv'`` in the current working directory.

        Returns:
            A status message string.
        """
        self.load_new_invoices()
        self.load_expired_invoices()
        transformed_data = self.transform_data()
        transformed_data.to_csv(output_path, index=False)
        return "CSV file saved successfully."

# Run the full pipeline: load both input files, flatten the invoices and
# write 'transformed_invoices.csv' into the current working directory.
extractor = DataExtractor(invoices_new_path, expired_invoices_path)
result = extractor.run()
# run() returns a status message, not the transformed DataFrame.
print(result)


CSV file saved successfully.
