In [1]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.0-py2.py3-none-any.whl (250 kB)
     -------------------------------------- 250.0/250.0 kB 2.6 MB/s eta 0:00:00
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.0


Working with Excel Files in Python - https://www.python-excel.org/

In [71]:
import os
import pandas as pd
import openpyxl
import time
from openpyxl.utils.dataframe import dataframe_to_rows

In [4]:
PATH = "../../data/excel/"

# BASIC Function

## Create New Excel workbook and new sheet

In [5]:
workbook = openpyxl.Workbook()
sheet = workbook.active

## Add data to sheet - simple

In [26]:
sheet["A1"] = "hello"
sheet["B1"] = "world!"
c1 = sheet.cell(row = 2, column = 1)
c1.value="new row - variant 2"
sheet.cell(row = 2, column = 2).value="new col"

## Save workbook in NEW Excel file

In [62]:
workbook.save(filename=f"{PATH}hello_world.xlsx")

## Reading Excel Spreadsheets

In [79]:
workbook = openpyxl.load_workbook(filename=f"{PATH}hello_world.xlsx")

## Simple info about Spreadsheets

In [29]:
print(workbook.sheetnames)
print(sheet := workbook.active)
print(sheet.title)

['Sheet']
<Worksheet "Sheet">
Sheet


## GET cells and data from cells

In [30]:
sheet["A1"]

<Cell 'Sheet'.A1>

In [31]:
sheet["A1"].value

'hello'

In [32]:
sheet["F10"].value

In [33]:
sheet.cell(row=1, column=2)

<Cell 'Sheet'.B1>

In [34]:
sheet.cell(row=1, column=2).value

'world!'

In [35]:
sheet["A1:C2"]

((<Cell 'Sheet'.A1>, <Cell 'Sheet'.B1>, <Cell 'Sheet'.C1>),
 (<Cell 'Sheet'.A2>, <Cell 'Sheet'.B2>, <Cell 'Sheet'.C2>))

In [36]:
sheet["A:B"]

((<Cell 'Sheet'.A1>,
  <Cell 'Sheet'.A2>,
  <Cell 'Sheet'.A3>,
  <Cell 'Sheet'.A4>,
  <Cell 'Sheet'.A5>,
  <Cell 'Sheet'.A6>,
  <Cell 'Sheet'.A7>,
  <Cell 'Sheet'.A8>,
  <Cell 'Sheet'.A9>,
  <Cell 'Sheet'.A10>),
 (<Cell 'Sheet'.B1>,
  <Cell 'Sheet'.B2>,
  <Cell 'Sheet'.B3>,
  <Cell 'Sheet'.B4>,
  <Cell 'Sheet'.B5>,
  <Cell 'Sheet'.B6>,
  <Cell 'Sheet'.B7>,
  <Cell 'Sheet'.B8>,
  <Cell 'Sheet'.B9>,
  <Cell 'Sheet'.B10>))

In [37]:
sheet[5]

(<Cell 'Sheet'.A5>,
 <Cell 'Sheet'.B5>,
 <Cell 'Sheet'.C5>,
 <Cell 'Sheet'.D5>,
 <Cell 'Sheet'.E5>,
 <Cell 'Sheet'.F5>)

In [38]:
[x.value for x in sheet[1] ]

['hello', 'world!', None, None, None, None]

In [80]:
print("Total Rows:", sheet.max_row)
print("Total Columns:", sheet.max_column)

Total Rows: 14
Total Columns: 6


In [93]:
workbook["Sheet"].dimensions

'A1:C14'

In [81]:
for i in range(1, sheet.max_row + 1): 
    cell = sheet.cell(row = i, column = 1) 
    print(cell.value) 

hello
new row - variant 2
None
500
600
None
= SUM(A4:A5)
None
None
None
1
4
1
4


In [41]:
cells = sheet['A1': 'B6']
for cell1, cell2 in cells:
    print(cell1.value, cell2.value)

hello world!
new row - variant 2 new col
None None
None None
None None
None None


## ADD Data to Spreadsheets

In [42]:
data = (
    (1, 2, 3),
    (4, 5, 6)
)
  
for row in data:
    sheet.append(row)
    
workbook.save(filename=f"{PATH}hello_world.xlsx")

In [61]:
data = [
    [1, 2, 3],
    [4, 5, 6]
]
  
for row in data:
    sheet.append(row)

In [43]:
print("Total Rows:", sheet.max_row)
print("Total Columns:", sheet.max_column)

Total Rows: 12
Total Columns: 6


In [44]:
sheet['A4'] = 500
sheet['A5'] = 600
sheet['A7'] = '= SUM(A4:A5)'

In [97]:
from openpyxl.utils import FORMULAE
#FORMULAE

## Managing sheets

In [85]:
workbook.sheetnames

['Sheet']

In [86]:
products_sheet = workbook["Sheet"]
products_sheet

<Worksheet "Sheet">

In [87]:
operations_sheet = workbook.create_sheet("Operations")
workbook.sheetnames

['Sheet', 'Operations']

In [88]:
# You can also define the position to create the sheet at
hr_sheet = workbook.create_sheet("HR", 0)
workbook.sheetnames

['HR', 'Sheet', 'Operations']

In [89]:
# To remove them, just pass the sheet as an argument to the .remove()
workbook.remove(operations_sheet)
workbook.sheetnames

['HR', 'Sheet']

In [90]:
workbook.copy_worksheet(products_sheet)
workbook.sheetnames

['HR', 'Sheet', 'Sheet Copy']

## Managing cells, rows, columns

In [82]:
# merge cells
sheet.merge_cells('B3:D5')   
sheet.cell(row = 3, column = 2).value = '9 cells join together.'

In [83]:
# unmerge cells
sheet.unmerge_cells('B3:D5') 

In [84]:
# insert / delete
# Insert a column before the existing column 1 ("A")
sheet.insert_cols(idx=1)
# Insert 5 columns between column 2 ("B") and 3 ("C")
sheet.insert_cols(idx=3, amount=5)
# Delete the created columns
sheet.delete_cols(idx=3, amount=5)
sheet.delete_cols(idx=1)
# Insert a new row in the beginning
sheet.insert_rows(idx=1)
# Insert 3 new rows in the beginning
sheet.insert_rows(idx=1, amount=3)
# Delete the first 4 rows
sheet.delete_rows(idx=1, amount=4)
for i in range(1, sheet.max_row + 1): 
    cell = sheet.cell(row = i, column = 1) 
    print(cell.value) 

hello
new row - variant 2
None
500
600
None
= SUM(A4:A5)
None
None
None
1
4
1
4


In [91]:
# freeze rows
sheet = workbook.active
sheet.freeze_panes = "C2"

In [98]:
# add auto filter
sheet=workbook["Sheet"]
print(sheet.dimensions)
sheet.auto_filter.ref = "A1:C14"

A1:C14


In [99]:
# add styles
from openpyxl.styles import Font, Color, Alignment, Border, Side

# Create a few styles
bold_font = Font(bold=True)
big_red_text = Font(color="00FF0000", size=20)
center_aligned_text = Alignment(horizontal="center")
double_border_side = Side(border_style="double")
square_border = Border(top=double_border_side,
                       right=double_border_side,
                       bottom=double_border_side,
                       left=double_border_side)
# Style some cells!
sheet["A2"].font = bold_font
sheet["A3"].font = big_red_text
sheet["A4"].alignment = center_aligned_text
sheet["A5"].border = square_border
workbook.save(filename=f"{PATH}hello_world.xlsx")

In [100]:
# add styles - way 2
from openpyxl.styles import NamedStyle
# Let's create a style template for the header row
header = NamedStyle(name="header")
header.font = Font(bold=True)
header.border = Border(bottom=Side(border_style="thin"))
header.alignment = Alignment(horizontal="center", vertical="center")

# Now let's apply this to all first row (header) cells
header_row = sheet[1]
for cell in header_row:
    cell.style = header
workbook.save(filename=f"{PATH}hello_world.xlsx")

In [101]:
# Conditional Formatting
from openpyxl.styles import PatternFill
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.formatting.rule import Rule

red_background = PatternFill(fgColor="00FF0000")
diff_style = DifferentialStyle(fill=red_background)
rule = Rule(type="expression", dxf=diff_style)
rule.formula = ["$H1<3"]
sheet.conditional_formatting.add("A1:O100", rule)
workbook.save(filename=f"{PATH}hello_world.xlsx")

In [102]:
# ColorScale

from openpyxl.formatting.rule import ColorScaleRule
color_scale_rule = ColorScaleRule(start_type="min",
                                  start_color="00FF0000",  # Red
                                  end_type="max",
                                  end_color="0000FF00")  # Green

# Again, let's add this gradient to the star ratings, column "H"
sheet.conditional_formatting.add("H2:H100", color_scale_rule)

In [103]:
# ColorScale 2
color_scale_rule = ColorScaleRule(start_type="num",
                                  start_value=1,
                                  start_color="00FF0000",  # Red
                                  mid_type="num",
                                  mid_value=3,
                                  mid_color="00FFFF00",  # Yellow
                                  end_type="num",
                                  end_value=5,
                                  end_color="0000FF00")  # Green

# Again, let's add this gradient to the star ratings, column "H"
sheet.conditional_formatting.add("H2:H100", color_scale_rule)

In [104]:
# IconSet 
# https://openpyxl.readthedocs.io/en/stable/formatting.html#iconset
from openpyxl.formatting.rule import IconSetRule

icon_set_rule = IconSetRule("5Arrows", "num", [1, 2, 3, 4, 5])
sheet.conditional_formatting.add("H2:H100", icon_set_rule)

In [105]:
# DataBar
from openpyxl.formatting.rule import DataBarRule

data_bar_rule = DataBarRule(start_type="num",
                            start_value=1,
                            end_type="num",
                            end_value="5",
                            color="0000FF00")  # Green
sheet.conditional_formatting.add("H2:H100", data_bar_rule)

## Adding charts

In [107]:
from openpyxl.chart import BarChart, Reference
operations_sheet = workbook.create_sheet("Operations")

rows = [
    ["Product", "Online", "Store"],
    [1, 30, 45],
    [2, 40, 30],
    [3, 40, 25],
    [4, 50, 30],
    [5, 30, 25],
    [6, 25, 35],
    [7, 20, 40],
]

for row in rows:
    operations_sheet.append(row)

In [111]:
chart = BarChart()
data = Reference(worksheet=operations_sheet,
                 min_row=1,
                 max_row=8,
                 min_col=2,
                 max_col=3)

chart.add_data(data, titles_from_data=True)
operations_sheet.add_chart(chart, "E2")

In [109]:
import random
from openpyxl.chart import LineChart, Reference

line_sheet = workbook.create_sheet("Lines")
rows = [
    ["", "January", "February", "March", "April",
    "May", "June", "July", "August", "September",
     "October", "November", "December"],
    [1, ],
    [2, ],
    [3, ],
]

for row in rows:
    line_sheet.append(row)
    
for row in line_sheet.iter_rows(min_row=2,
                           max_row=4,
                           min_col=2,
                           max_col=13):
    for cell in row:
        cell.value = random.randrange(5, 100)


In [117]:
chart = LineChart()
data = Reference(worksheet=line_sheet,
                 min_row=2,
                 max_row=4,
                 min_col=1,
                 max_col=13)

chart.add_data(data, from_rows=True, titles_from_data=True)
cats = Reference(worksheet=line_sheet,
                 min_row=1,
                 max_row=1,
                 min_col=2,
                 max_col=13)
chart.set_categories(cats)
chart.x_axis.title = "Months"
chart.y_axis.title = "Sales (per unit)"
# You can play with this by choosing any number between 1 and 48
chart.style = 24
line_sheet.add_chart(chart, "C6")

workbook.save(filename=f"{PATH}hello_world.xlsx")

In [114]:
chart = LineChart()
data = Reference(worksheet=line_sheet,
                 min_row=2,
                 max_row=4,
                 min_col=1,
                 max_col=13)

chart.add_data(data, from_rows=False, titles_from_data=True)
line_sheet.add_chart(chart, "C21")

workbook.save(filename=f"{PATH}hello_world.xlsx")

## Iteration cells

In [50]:
for row in sheet.iter_rows(min_row=1,max_row=2,min_col=1,max_col=3):
    print(row)

(<Cell 'Sheet'.A1>, <Cell 'Sheet'.B1>, <Cell 'Sheet'.C1>)
(<Cell 'Sheet'.A2>, <Cell 'Sheet'.B2>, <Cell 'Sheet'.C2>)


In [51]:
for column in sheet.iter_cols(min_row=1,max_row=2,min_col=1,max_col=3):
    print(column)

(<Cell 'Sheet'.A1>, <Cell 'Sheet'.A2>)
(<Cell 'Sheet'.B1>, <Cell 'Sheet'.B2>)
(<Cell 'Sheet'.C1>, <Cell 'Sheet'.C2>)


In [52]:
# get value
for row in sheet.iter_rows(min_row=1,max_row=2,min_col=1,max_col=3,values_only=True):
    print(row)

('hello', 'world!', None)
('new row - variant 2', 'new col', None)


In [53]:
# all rows or columns
for row in sheet.rows:
    print(row)

(<Cell 'Sheet'.A1>, <Cell 'Sheet'.B1>, <Cell 'Sheet'.C1>, <Cell 'Sheet'.D1>, <Cell 'Sheet'.E1>, <Cell 'Sheet'.F1>)
(<Cell 'Sheet'.A2>, <Cell 'Sheet'.B2>, <Cell 'Sheet'.C2>, <Cell 'Sheet'.D2>, <Cell 'Sheet'.E2>, <Cell 'Sheet'.F2>)
(<Cell 'Sheet'.A3>, <Cell 'Sheet'.B3>, <Cell 'Sheet'.C3>, <Cell 'Sheet'.D3>, <Cell 'Sheet'.E3>, <Cell 'Sheet'.F3>)
(<Cell 'Sheet'.A4>, <Cell 'Sheet'.B4>, <Cell 'Sheet'.C4>, <Cell 'Sheet'.D4>, <Cell 'Sheet'.E4>, <Cell 'Sheet'.F4>)
(<Cell 'Sheet'.A5>, <Cell 'Sheet'.B5>, <Cell 'Sheet'.C5>, <Cell 'Sheet'.D5>, <Cell 'Sheet'.E5>, <Cell 'Sheet'.F5>)
(<Cell 'Sheet'.A6>, <Cell 'Sheet'.B6>, <Cell 'Sheet'.C6>, <Cell 'Sheet'.D6>, <Cell 'Sheet'.E6>, <Cell 'Sheet'.F6>)
(<Cell 'Sheet'.A7>, <Cell 'Sheet'.B7>, <Cell 'Sheet'.C7>, <Cell 'Sheet'.D7>, <Cell 'Sheet'.E7>, <Cell 'Sheet'.F7>)
(<Cell 'Sheet'.A8>, <Cell 'Sheet'.B8>, <Cell 'Sheet'.C8>, <Cell 'Sheet'.D8>, <Cell 'Sheet'.E8>, <Cell 'Sheet'.F8>)
(<Cell 'Sheet'.A9>, <Cell 'Sheet'.B9>, <Cell 'Sheet'.C9>, <Cell 'Sheet'.D9>, <Ce

# ADVANCED Function

## Convert data from .parquet to Excel file

In [68]:
def from_parquet_to_excel(file_parquet,file_excel):
    df=pd.read_parquet(file_parquet)
    print(df.shape)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row in dataframe_to_rows(df, index=False, header=True):
        sheet.append(row)
    workbook.save(filename=file_excel)
    print("Total Rows:", sheet.max_row)
    print("Total Columns:", sheet.max_column)
    print(f"file parquet size, mb: {round(os.path.getsize(file_parquet)/(1024*1024),1)}")
    print(f"file excel size, mb: {round(os.path.getsize(file_excel)/(1024*1024),1)}")

In [69]:
%%time
file_parquet=f'../../data/olist/olist_order_items_dataset.parquet'
file_excel=f"{PATH}olist_order_items.xlsx"
_ = from_parquet_to_excel(file_parquet,file_excel)

(112650, 7)
Total Rows: 112651
Total Columns: 7
file parquet size, mb: 6.3
file excel size, mb: 9.9
CPU times: total: 14.5 s
Wall time: 14.5 s


In [119]:
%%time
file_parquet=f'../../data/olist/olist_products_dataset.parquet'
file_excel=f"{PATH}olist_products.xlsx"
_ = from_parquet_to_excel(file_parquet,file_excel)

(32951, 9)
Total Rows: 32952
Total Columns: 9
file parquet size, mb: 1.4
file excel size, mb: 2.0
CPU times: total: 6 s
Wall time: 5.99 s


## Convert data from Excel file to pandas dataframe

In [76]:
def from_excel_to_df(file_excel):
    workbook = openpyxl.load_workbook(filename=file_excel)
    sheet = workbook.active
    values = sheet.values
    df = pd.DataFrame(values)
    df.columns = df.iloc[0] 
    df = df[1:]
    return df

In [77]:
%%time
file_excel=f"{PATH}olist_order_items.xlsx"
df=from_excel_to_df(file_excel)
print(df.shape)
df.head()

(112650, 7)
CPU times: total: 16.3 s
Wall time: 16.3 s


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
1,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
2,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
3,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
4,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
5,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [133]:
%%time
file_excel=f"{PATH}olist_products.xlsx"
df_prod=from_excel_to_df(file_excel)
print(df_prod.shape)
df_prod.head()

(32951, 9)
CPU times: total: 5.36 s
Wall time: 5.37 s


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
1,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14
2,3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20
3,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15
4,cef67bcfe19066a932b7673e239eb23d,bebes,27,261,1,371,26,4,26
5,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37,402,4,625,20,17,13


## USE DataClasses for get data from Excel

### STEP 1. Create 'classes.py'

In [166]:
import datetime
from dataclasses import dataclass

@dataclass
class OrderItem:
    order_id: str
    order_item_id: int
    product_id: str
    seller_id: str
    shipping_limit_date: datetime.datetime
    price: float
    freight_value: float

@dataclass
class Product:
    product_id: str
    product_category_name: str
    product_name_lenght: int
    product_description_lenght: int
    product_photos_qty: int
    product_weight_g: int
    product_length_cm: int
    product_height_cm: int
    product_width_cm: int

### STEP 2. Create 'mapping.py' (column location (zero-indexed) on the spreadsheet)

In [153]:
# OrderItems:
ORDIT_ORDER_ID = 0
ORDIT_ORDER_ITEM_ID = 1
ORDIT_PRODUCT_ID = 2
ORDIT_SELLER_ID = 3
ORDIT_SHIPPING_LIMIT_DATE = 4
ORDIT_PRICE = 5
ORDIT_FREIGHT_VALUE = 6

# Products:
PROD_PRODUCT_ID = 0
PROD_PRODUCT_CATEGORY_NAME = 1
PROD_PRODUCT_NAME_LENGHT = 2
PROD_PRODUCT_DESCRIPTION_LENGHT = 3
PROD_PRODUCT_PHOTOS_QTY = 4
PROD_PRODUCT_WEIGHT_G = 5
PROD_PRODUCT_LENGTH_CM = 6
PROD_PRODUCT_HEIGHT_CM = 7
PROD_PRODUCT_WIDTH_CM = 8

### STEP 3. Get data from Excel to Dataclasses

In [164]:
%%time
# this block uses script py
#from classes import OrderItem, Product
#from mapping import ORDIT_ORDER_ID, ORDIT_ORDER_ITEM_ID, ORDIT_PRODUCT_ID, ORDIT_SELLER_ID, ORDIT_SHIPPING_LIMIT_DATE, ORDIT_PRICE, ORDIT_FREIGHT_VALUE
#from mapping import PROD_PRODUCT_ID, PROD_PRODUCT_CATEGORY_NAME, PROD_PRODUCT_NAME_LENGHT, PROD_PRODUCT_DESCRIPTION_LENGHT, PROD_PRODUCT_PHOTOS_QTY
#from mapping import PROD_PRODUCT_WEIGHT_G, PROD_PRODUCT_LENGTH_CM, PROD_PRODUCT_HEIGHT_CM, PROD_PRODUCT_WIDTH_CM

file_excel=f"{PATH}olist_order_items.xlsx"
workbook = openpyxl.load_workbook(filename=file_excel)
sheet = workbook.active

orders_items = []

# Using the values_only because you just want to return the cell value
for row in sheet.iter_rows(min_row=2, values_only=True):
    item = OrderItem(order_id=row[ORDIT_ORDER_ID],
                     order_item_id=row[ORDIT_ORDER_ITEM_ID],
                     product_id=row[ORDIT_PRODUCT_ID],
                     seller_id=row[ORDIT_SELLER_ID],
                     shipping_limit_date=row[ORDIT_SHIPPING_LIMIT_DATE],
                     price=row[ORDIT_PRICE],
                     freight_value=row[ORDIT_FREIGHT_VALUE])
    orders_items.append(item)
df_ord_class=pd.DataFrame(orders_items)
print(df_ord_class.shape)
df_ord_class.head()

(112650, 7)
CPU times: total: 18 s
Wall time: 18.1 s


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [167]:
%%time
# this block uses script py
#from classes import OrderItem, Product
#from mapping import ORDIT_ORDER_ID, ORDIT_ORDER_ITEM_ID, ORDIT_PRODUCT_ID, ORDIT_SELLER_ID, ORDIT_SHIPPING_LIMIT_DATE, ORDIT_PRICE, ORDIT_FREIGHT_VALUE
#from mapping import PROD_PRODUCT_ID, PROD_PRODUCT_CATEGORY_NAME, PROD_PRODUCT_NAME_LENGHT, PROD_PRODUCT_DESCRIPTION_LENGHT, PROD_PRODUCT_PHOTOS_QTY
#from mapping import PROD_PRODUCT_WEIGHT_G, PROD_PRODUCT_LENGTH_CM, PROD_PRODUCT_HEIGHT_CM, PROD_PRODUCT_WIDTH_CM

file_excel=f"{PATH}olist_products.xlsx"
workbook = openpyxl.load_workbook(filename=file_excel)
sheet = workbook.active

products = []

# Using the values_only because you just want to return the cell value
for row in sheet.iter_rows(min_row=2, values_only=True):
    item = Product(product_id=row[PROD_PRODUCT_ID],
                     product_category_name=row[PROD_PRODUCT_CATEGORY_NAME],
                     product_name_lenght=row[PROD_PRODUCT_NAME_LENGHT],
                     product_description_lenght=row[PROD_PRODUCT_DESCRIPTION_LENGHT],
                     product_photos_qty=row[PROD_PRODUCT_PHOTOS_QTY],
                     product_weight_g=row[PROD_PRODUCT_WEIGHT_G],
                     product_length_cm=row[PROD_PRODUCT_LENGTH_CM],
                     product_height_cm=row[PROD_PRODUCT_HEIGHT_CM],
                     product_width_cm=row[PROD_PRODUCT_WIDTH_CM])
    products.append(item)
df_prod_class=pd.DataFrame(products)
print(df_prod_class.shape)
df_prod_class.head()

(32951, 9)
CPU times: total: 6.52 s
Wall time: 6.54 s


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


# USE CASES

1. Excel - витрина данных
    - НЕ используется для внесения данных
    - выступает в роли ПРОСТОЙ витрины данных: получил файл, посмотрел данные (графики, таблицы и т.д.), минимально их "покрутил" в этом файле и "забыл", потому что в следующий раз получит такой же файл с обновленными данными.
    - файл может как класться в определенное место, так и отправляться по почте
    - этот сценарий подходит, когда есть много разрозненных данных, сложных данных, больших и сложных таблиц, данные надо взять с баз данных, не хочется связываться пока с Power BI (т.е. нужны просто РЕГУЛЯРНЫЕ простые отчеты)
    - при этом можно создавать кучу листов, на каждом своя таблица и визуализация.
    - этот сценарий хорош, чтобы каждому рассылаться только его данные (т.е. минимизируем таблицу, а также имеем сразу фильтр на доступ только к определенной инфе).
    
2. Excel - база данных
    - в один или несколько файлов вносятся данные 
    - а также они могут обновляться (например, корректировка запасов)
    - как правило, такие файлы длинные и выглядят они как добавляемые строки со стандартным набором столбцов
    - в них могут быть отдельные листы со справочниками, из которых потом подтягиваются данные

3. Excel - как транзитный формат
    - например ситуация, когда мы не можем напрямую забрать данные из источника, но источник может их выгрузить в файл Excel и прислать куда надо
    - и тогда наша задача эту инфу считать и положить в нужную базу данных
    
4. Excel - как форма ввода данных (интерфейс)
    - скорее всего здесь будет открытый лист - с формой ввода
    - и скрытые для пользователя листы, где сохраняются данные

5. Excel - как сложный большой расчетный файл
    - например, файл с расчетом бюджета.
    - в результате появляются задачи: 
        - свести в одном месте и проанализировать разные версии / варианты бюджета
        - добавить фактические данные (как для анализа бюджета, так и как исходные данные для его расчета)
    - эти задачи вручную делать сложно. А также сложно поддерживать формулы (особенно, когда добавляются разные новые подкатегории, категории, колонки и т.д.)
    - при этом сам набор ЛОГИКИ он не сложный и его можно описать как некий MAPPING
    - и здесь мы можем создавать два варианта решений:
        - просто генерация такого бюджета как ФИНАЛЬНАЯ ТАБЛИЦА (т.е. без формул, чтобы просто смотреть). В этом случае вся логика расчета делается в более простых и структурированных таблицах, в т.ч.в виде расчетов в базе данных (например, отдельно считаем прогноз продаж, отдельно прогноз HR-расходов)
        - ведущая роль у таблицы с ее формулами, а в Python добавляем возможности быстро ее анализировать и преобрзовывать в плоские таблицы и т.д.