# Automatización en Python para Excel

Primero importamos las librerías necesarias.

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [80]:
import openpyxl

## Convertir un JSON a un libro de Excel

In [81]:
df = pd.read_json("http://api.nobelprize.org/v1/prize.json")
df

Unnamed: 0,prizes
0,"{'year': '2017', 'category': 'physics', 'laure..."
1,"{'year': '2017', 'category': 'chemistry', 'lau..."
2,"{'year': '2017', 'category': 'medicine', 'laur..."
3,"{'year': '2017', 'category': 'literature', 'la..."
4,"{'year': '2017', 'category': 'peace', 'laureat..."
5,"{'year': '2017', 'category': 'economics', 'lau..."
6,"{'year': '2016', 'category': 'physics', 'laure..."
7,"{'year': '2016', 'category': 'chemistry', 'lau..."
8,"{'year': '2016', 'category': 'medicine', 'laur..."
9,"{'year': '2016', 'category': 'literature', 'la..."


In [82]:
df2 = df.to_dict()

In [83]:
df2['prizes'][584]

{'category': 'peace',
 'laureates': [{'firstname': 'Jean Henry',
   'id': '462',
   'share': '2',
   'surname': 'Dunant'},
  {'firstname': 'Frédéric', 'id': '463', 'share': '2', 'surname': 'Passy'}],
 'year': '1901'}

In [84]:
len(df2['prizes'])

585

In [85]:
pd.DataFrame(df2['prizes'][0]['laureates'])

Unnamed: 0,firstname,id,motivation,share,surname
0,Rainer,941,"""for decisive contributions to the LIGO detect...",2,Weiss
1,Barry C.,942,"""for decisive contributions to the LIGO detect...",4,Barish
2,Kip S.,943,"""for decisive contributions to the LIGO detect...",4,Thorne


In [86]:
# Flatten the nested prize records into one row per laureate, tagging every
# row with the category and year of the prize it belongs to.
#
# The per-prize frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
laureate_frames = []
for prize in range(0, len(df2['prizes'])):
    record = df2['prizes'][prize]
    # Skip records that carry no laureate list instead of failing with KeyError.
    if not record.get('laureates'):
        continue
    laureates = pd.DataFrame(record['laureates'])
    laureates['category'] = record['category']
    laureates['year'] = record['year']
    laureate_frames.append(laureates)

# sort=True orders the columns alphabetically, which is the layout the
# following cells display (category, firstname, id, ...).
nobel = pd.concat(laureate_frames, ignore_index=True, sort=True)

In [87]:
nobel.head()

Unnamed: 0,category,firstname,id,motivation,share,surname,year
0,physics,Rainer,941,"""for decisive contributions to the LIGO detect...",2,Weiss,2017
1,physics,Barry C.,942,"""for decisive contributions to the LIGO detect...",4,Barish,2017
2,physics,Kip S.,943,"""for decisive contributions to the LIGO detect...",4,Thorne,2017
3,chemistry,Jacques,944,"""for developing cryo-electron microscopy for t...",3,Dubochet,2017
4,chemistry,Joachim,945,"""for developing cryo-electron microscopy for t...",3,Frank,2017


In [88]:
nobel.tail()

Unnamed: 0,category,firstname,id,motivation,share,surname,year
918,chemistry,Jacobus Henricus,160,"""in recognition of the extraordinary services ...",1,van 't Hoff,1901
919,medicine,Emil Adolf,293,"""for his work on serum therapy, especially its...",1,von Behring,1901
920,literature,Sully,569,"""in special recognition of his poetic composit...",1,Prudhomme,1901
921,peace,Jean Henry,462,,2,Dunant,1901
922,peace,Frédéric,463,,2,Passy,1901


In [89]:
nobel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 7 columns):
category      923 non-null object
firstname     923 non-null object
id            923 non-null object
motivation    835 non-null object
share         923 non-null object
surname       923 non-null object
year          923 non-null object
dtypes: object(7)
memory usage: 50.6+ KB


In [90]:
# Cast the columns that hold numbers stored as text to numeric dtypes,
# converting all of them in one column-wise pass.
columns = ['id','share','year']
nobel[columns] = nobel[columns].apply(pd.to_numeric)

In [91]:
nobel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 7 columns):
category      923 non-null object
firstname     923 non-null object
id            923 non-null int64
motivation    835 non-null object
share         923 non-null int64
surname       923 non-null object
year          923 non-null int64
dtypes: int64(3), object(4)
memory usage: 50.6+ KB


In [92]:
from openpyxl.utils.dataframe import dataframe_to_rows

In [93]:
wb = openpyxl.Workbook()
ws = wb.active

In [94]:
# Write the frame into the active sheet: header first, then one worksheet
# row per DataFrame row (the index is not exported).
filas = dataframe_to_rows(nobel, index=False, header=True)
for fila in filas:
    ws.append(fila)

In [95]:
# Apply the 'Pandas' named style to every cell of the first (header) row.
encabezado = ws[1]
for celda in encabezado:
    celda.style = 'Pandas'

In [96]:
wb.save("pandas_nobel.xlsx")

## Cargar DataFrame desde Excel

In [97]:
wb_votacion_2006 = openpyxl.load_workbook("votacion_pres_2006.xlsx")

In [98]:
# List the sheets in the workbook. `sheetnames` replaces the deprecated
# `get_sheet_names()` accessor and returns the same list of titles.
print(wb_votacion_2006.sheetnames)

['votacion_pres_2006']


In [99]:
ws_votacion_2006 = wb_votacion_2006['votacion_pres_2006']

In [100]:
# `data` walks over the worksheet's cell values row by row.
# next() pulls the first row off that iterator so it can serve as the column
# header; whatever is read from `data` afterwards starts at the second row.
# NOTE(review): `data` is consumed as it is read, so this cell presumably has
# to be re-run before rebuilding the DataFrame from it - confirm.
data = ws_votacion_2006.values
cols = next(data)

In [101]:
df_vot_2006 = pd.DataFrame(data=data, columns=cols)
df_vot_2006.head()

Unnamed: 0,id_estado,distrito,seccion,id_casilla,tipo_casilla,ext_contigua,tipo_candidatura,tipo_acta,lista_nominal,no_votos_nulos,...,orden,pan,apm,pbt,na,asdc,municipio,paquete_entregado,casilla_instalada,fecha_hora
0,1,1,0,1,B,0,6,1,120,0,...,1,53,4,18,0,1,0,0,0,2006-05-07 23:22:10
1,1,1,338,1,B,0,1,1,689,12,...,1,202,45,65,12,12,2,1,1,2006-05-07 08:17:05
2,1,1,338,1,C,0,1,1,689,7,...,2,175,38,56,3,8,2,1,1,2006-05-07 08:22:24
3,1,1,339,1,B,0,1,1,630,0,...,3,165,50,48,7,13,2,1,1,2006-05-07 08:24:47
4,1,1,339,1,C,0,1,1,631,6,...,4,174,64,32,9,7,2,1,1,2006-05-07 08:25:03


In [102]:
df_vot_2006.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130788 entries, 0 to 130787
Data columns (total 23 columns):
id_estado            130788 non-null int64
distrito             130788 non-null int64
seccion              130788 non-null int64
id_casilla           130788 non-null int64
tipo_casilla         130788 non-null object
ext_contigua         130788 non-null int64
tipo_candidatura     130788 non-null int64
tipo_acta            130788 non-null int64
lista_nominal        130788 non-null int64
no_votos_nulos       130788 non-null int64
no_votos_can_nreg    130788 non-null int64
no_votos_validos     130788 non-null int64
total_votos          130788 non-null int64
orden                130788 non-null int64
pan                  130788 non-null int64
apm                  130788 non-null int64
pbt                  130788 non-null int64
na                   130788 non-null int64
asdc                 130788 non-null int64
municipio            130788 non-null int64
paquete_entregado    130788 

## Estudio de Caso

In [103]:
df_vot_2006['prop_vot']= (df_vot_2006['total_votos']/df_vot_2006['lista_nominal']*100)
df_vot_2006['prop_vot']

0         63.333333
1         51.233672
2         43.251089
3         44.920635
4         47.068146
5         57.166392
6         53.377265
7         62.688822
8         59.276018
9         58.168761
10        49.910233
11        62.745098
12        61.064426
13        64.475524
14        49.183007
15        57.748777
16        50.407830
17        47.368421
18        45.161290
19        50.254669
20        62.889518
21        49.433428
22        57.819905
23        55.292259
24        59.621451
25        54.188948
26        50.267380
27        50.000000
28        47.766323
29        50.943396
            ...    
130758    50.911854
130759    61.456103
130760    58.886510
130761    52.025586
130762    55.650320
130763    56.660040
130764    59.443340
130765    65.042980
130766    55.696203
130767    63.988095
130768    51.550388
130769    31.016043
130770    54.030501
130771    54.484305
130772    48.373102
130773    59.782609
130774    51.627907
130775    53.023256
130776    61.538462


In [104]:
df_vot_2006['prop_nulos']= (df_vot_2006['no_votos_nulos']/df_vot_2006['total_votos']*100)
df_vot_2006['prop_nulos']

0         0.000000
1         3.399433
2         2.348993
3         0.000000
4         2.020202
5         2.305476
6         3.703704
7         2.168675
8         2.035623
9         2.160494
10        1.438849
11        3.571429
12        3.211009
13        2.603037
14        2.325581
15        2.259887
16        1.941748
17        1.075269
18        1.879699
19        6.081081
20        1.801802
21        3.724928
22        3.551913
23        3.142857
24        1.851852
25        2.960526
26        3.900709
27        0.687285
28        2.517986
29        2.693603
            ...   
130758    1.194030
130759    2.090592
130760    2.545455
130761    2.459016
130762    1.149425
130763    2.807018
130764    2.341137
130765    0.000000
130766    5.303030
130767    1.860465
130768    2.255639
130769    5.172414
130770    2.419355
130771    2.880658
130772    3.587444
130773    4.242424
130774    1.351351
130775    3.070175
130776    1.602564
130777    1.501502
130778    0.000000
130779    8.

In [105]:
# Keep only the rows whose share of null votes is above 10 %.
mas_del_10 = df_vot_2006['prop_nulos'] > 10
df_mas_del_10_nulos = df_vot_2006[mas_del_10]
df_mas_del_10_nulos['prop_nulos']

2421      10.465116
2431      10.373444
2845      10.273973
3944      14.716981
4115      50.000000
4198      11.217949
4837      10.294118
5072      13.333333
5607      46.562500
5623      11.614731
5791      11.500975
5799      11.497326
5901      10.218978
6212      10.028653
6260      12.169312
6295      10.216718
6822      11.711712
7053      10.661765
7426      50.000000
7739      12.500000
7790      11.787072
8348      12.000000
10263     14.210526
10434     22.421525
10459     16.793893
10471     43.502825
10489     10.727969
10491     14.583333
10494     12.169312
10557     13.478261
            ...    
125974    10.505837
126086    10.823529
126186    11.572052
127404    41.184388
127599    24.260355
127965    11.370262
128041    13.966480
128141    15.873016
128605    10.493827
128670    10.294118
128762    10.638298
128932    11.320755
129019    11.111111
129089    12.213740
129092    16.071429
129166    10.169492
129217    11.290323
129312    12.121212
129367    10.679612


In [106]:
len(df_mas_del_10_nulos['prop_nulos'])

1348

In [107]:
partidos = ['pan','pbt','apm','total_votos']

In [108]:
# For every entry in `partidos`: [absolute votes, percentage of all votes
# cast rounded to two decimals], computed over the whole data set.
votos_2006 = df_vot_2006[partidos].sum()
res_2006 = {
    partido: [
        votos_2006[partido],
        round(votos_2006[partido] / votos_2006['total_votos'] * 100, 2),
    ]
    for partido in partidos
}
res_2006

{'apm': [9301441, 22.26],
 'pan': [15000284, 35.89],
 'pbt': [14756350, 35.31],
 'total_votos': [41791322, 100.0]}

In [109]:
# One line per party: NAME votes percentage (space separated).
for partido in partidos:
    votos, porcentaje = res_2006[partido]
    print(partido.upper(), votos, porcentaje)

PAN 15000284 35.89
PBT 14756350 35.31
APM 9301441 22.26
TOTAL_VOTOS 41791322 100.0


In [110]:
# Same summary as for the full data set, restricted to the rows with more
# than 10 % null votes: [absolute votes, percentage rounded to two decimals].
total_10mas = df_mas_del_10_nulos['total_votos'].sum()
res_2006_10mas = {}
for partido in partidos:
    votos_partido = df_mas_del_10_nulos[partido].sum()
    res_2006_10mas[partido] = [votos_partido, round(votos_partido / total_10mas * 100, 2)]
res_2006_10mas

{'apm': [100698, 32.55],
 'pan': [70652, 22.84],
 'pbt': [81528, 26.36],
 'total_votos': [309331, 100.0]}

In [111]:
# One line per party for the high-null-vote subset: NAME votes percentage.
for partido in partidos:
    votos, porcentaje = res_2006_10mas[partido]
    print(partido.upper(), votos, porcentaje)

PAN 70652 22.84
PBT 81528 26.36
APM 100698 32.55
TOTAL_VOTOS 309331 100.0


In [112]:
df_vot_2006['pan-pbt'] = df_vot_2006['pan']-df_vot_2006['pbt']

In [113]:
# Boolean mask: rows where PAN and PBT differ by more than 200 votes in
# either direction.
gran_dif = df_vot_2006['pan-pbt'].abs() > 200

In [114]:
df_gran_dif_2006 = df_vot_2006[gran_dif]
len(df_gran_dif_2006)

9785

In [115]:
# Votes and percentage (two decimals) per party, limited to the rows with a
# large PAN/PBT difference.
votos_gran_dif = df_gran_dif_2006[partidos].sum()
total_gran_dif = votos_gran_dif['total_votos']
res_gran_dif_2006 = {
    partido: [votos_gran_dif[partido], round(votos_gran_dif[partido] / total_gran_dif * 100, 2)]
    for partido in partidos
}
res_gran_dif_2006

{'apm': [635253, 14.99],
 'pan': [1706917, 40.270000000000003],
 'pbt': [1701090, 40.140000000000001],
 'total_votos': [4238222, 100.0]}

In [116]:
# One line per party for the large-difference subset: NAME votes percentage.
for partido in partidos:
    votos, porcentaje = res_gran_dif_2006[partido]
    print(partido.upper(), votos, porcentaje)

PAN 1706917 40.27
PBT 1701090 40.14
APM 635253 14.99
TOTAL_VOTOS 4238222 100.0


In [117]:
# Rows where turnout (`prop_vot`) is above 50 %, followed by how many there are.
alta_participacion = df_vot_2006['prop_vot'] > 50
df_alta_votacion = df_vot_2006[alta_participacion]
len(df_alta_votacion['prop_vot'])

101509

In [118]:
# Votes and percentage (two decimals) per party for the high-turnout rows.
total_alta_votacion = df_alta_votacion['total_votos'].sum()
res_alta_votacion_2006 = {}
for partido in partidos:
    votos_partido = df_alta_votacion[partido].sum()
    porcentaje = round(votos_partido / total_alta_votacion * 100, 2)
    res_alta_votacion_2006[partido] = [votos_partido, porcentaje]
res_alta_votacion_2006

{'apm': [7414727, 21.16],
 'pan': [12838633, 36.640000000000001],
 'pbt': [12523725, 35.740000000000002],
 'total_votos': [35041853, 100.0]}

## Gráficas en Excel

In [119]:
from openpyxl.chart import BarChart, Series, Reference

In [120]:
wb = openpyxl.Workbook(write_only=True)
ws = wb.create_sheet()

In [121]:
resultados = [res_2006,res_2006_10mas,res_gran_dif_2006,res_alta_votacion_2006]
partidos_chart = ['pan','pbt','apm']

In [122]:
data =[[round(res[partido][1],2) for partido in partidos_chart] for res in resultados] 

In [123]:
rows = [partidos_chart] + data + [['2006','Nulos','Gran Diferencia','Votación Alta']]
rows

[['pan', 'pbt', 'apm'],
 [35.89, 35.31, 22.26],
 [22.84, 26.36, 32.55],
 [40.270000000000003, 40.140000000000001, 14.99],
 [36.640000000000001, 35.740000000000002, 21.16],
 ['2006', 'Nulos', 'Gran Diferencia', 'Votación Alta']]

In [124]:
for row in rows:
    ws.append(row)

In [125]:
# Vertical bar chart of the party percentages written to the sheet above.
chart1 = BarChart()
chart1.type = "col"
chart1.style = 10
chart1.shape = 4
chart1.title = "Elecciones Presidenciales 2006"
chart1.x_axis.title = 'Escenarios'
chart1.y_axis.title = '% de votación'

# Rows 1-5, columns 1-3: the party-name row followed by the four rows of
# percentages; titles_from_data=True uses that first row for series titles.
data = Reference(ws, min_row=1, max_row=5, min_col=1, max_col=3)
# Row 6 holds the four scenario labels used as categories.
cats = Reference(ws, min_row=6, max_row=6, min_col=1, max_col=4)
chart1.add_data(data, titles_from_data=True)
chart1.set_categories(cats)

ws.add_chart(chart1, "A10")

In [126]:
from copy import deepcopy

# Second chart: a copy of chart1 drawn as horizontal bars with another style.
# It plots the same 2006 election data, so the title keeps the year 2006
# (it previously read "2016").
chart2 = deepcopy(chart1)
chart2.style = 11
chart2.type = "bar"
chart2.title = "Elecciones Presidenciales 2006"

ws.add_chart(chart2, "H10")

In [127]:
wb.save("bar.xlsx")

## Reportes de Ventas

In [401]:
df = pd.read_excel('sales_data/1_Ene2017_Sales.xlsx')
df.head()

Unnamed: 0,date,id,amount,size,gender
0,2017-01-05,1,$2759.25,XS,Female
1,2017-01-18,2,$4809.65,3XL,Female
2,2017-01-02,3,$195.69,M,Female
3,2017-01-28,4,$3569.84,L,Female
4,2017-01-28,5,$4033.22,S,Male


In [402]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
date      1000 non-null datetime64[ns]
id        1000 non-null int64
amount    1000 non-null object
size      1000 non-null object
gender    1000 non-null object
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 39.1+ KB


In [413]:
# Strip the currency symbol and thousands separators, then parse as float.
# The character class is `[\$,]`; the previous pattern `[/$,]` had a `/`
# where the escaping backslash belongs, so it also removed slashes and could
# silently turn a malformed value such as "12/5" into 125.0.
df['amount'] = df['amount'].replace(r'[\$,]', '', regex=True).astype(float)
df['amount'].head()

0    2759.25
1    4809.65
2     195.69
3    3569.84
4    4033.22
Name: amount, dtype: float64

In [416]:
df['month'] = df['date'].dt.month
df['month'].head()

0    1
1    1
2    1
3    1
4    1
Name: month, dtype: int64

In [421]:
# Use the sale date as the index. The original line read `keys'date'`
# (missing `=`), which is a syntax error.
df.set_index(keys='date', inplace=True)
df.head()

Unnamed: 0_level_0,id,amount,size,gender,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-05,1,2759.25,XS,Female,1
2017-01-18,2,4809.65,3XL,Female,1
2017-01-02,3,195.69,M,Female,1
2017-01-28,4,3569.84,L,Female,1
2017-01-28,5,4033.22,S,Male,1


In [422]:
meses = ['Ene','Feb','Mar','Abr','May','Jun','Jul','Ago','Sep','Oct','Nov','Dic']

In [427]:
# Build the path of each monthly workbook: sales_data/<n>_<Mes>2017_Sales.xlsx,
# numbering the months from 1 in the order given by `meses`.
rutas = [
    'sales_data/{}_{}2017_Sales.xlsx'.format(i, mes)
    for i, mes in enumerate(meses, start=1)
]

In [428]:
rutas

['sales_data/1_Ene2017_Sales.xlsx',
 'sales_data/2_Feb2017_Sales.xlsx',
 'sales_data/3_Mar2017_Sales.xlsx',
 'sales_data/4_Abr2017_Sales.xlsx',
 'sales_data/5_May2017_Sales.xlsx',
 'sales_data/6_Jun2017_Sales.xlsx',
 'sales_data/7_Jul2017_Sales.xlsx',
 'sales_data/8_Ago2017_Sales.xlsx',
 'sales_data/9_Sep2017_Sales.xlsx',
 'sales_data/10_Oct2017_Sales.xlsx',
 'sales_data/11_Nov2017_Sales.xlsx',
 'sales_data/12_Dic2017_Sales.xlsx']

In [430]:
def procesar_ventas(ruta):
    """Load one monthly sales workbook and normalise it.

    Parameters
    ----------
    ruta : str
        Path of the Excel file to read with ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet's rows indexed by ``date``, with ``amount`` parsed as float
        and an added ``month`` column taken from the date.

    Raises
    ------
    ValueError
        If an ``amount`` value is not a number once ``$`` and ``,`` are removed.
    """
    df = pd.read_excel(ruta)
    # Only the currency symbol and thousands separators are stripped. The
    # earlier pattern `[/$,]` (a `/` typed instead of the escaping `\`) also
    # removed slashes, silently turning e.g. "12/5" into 125.0.
    df['amount'] = df['amount'].replace(r'[\$,]', '', regex=True).astype(float)
    df['month'] = df['date'].dt.month
    return df.set_index(keys='date')

In [434]:
def concatenar_meses_ventas(rutas):
    """Load every monthly workbook and stack them into a single DataFrame.

    Parameters
    ----------
    rutas : iterable of str
        Paths of the workbooks, each passed to ``procesar_ventas``.

    Returns
    -------
    pandas.DataFrame
        The processed frames stacked in the order of ``rutas``; an empty
        DataFrame when ``rutas`` is empty.
    """
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied all accumulated rows on every iteration.
    frames = [procesar_ventas(ruta) for ruta in rutas]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [436]:
df = concatenar_meses_ventas(rutas)
df.tail()

Unnamed: 0_level_0,id,amount,size,gender,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-12-28,996,2514.05,L,Female,12
2017-12-21,997,2190.12,XL,Female,12
2017-12-10,998,905.67,M,Female,12
2017-12-13,999,2338.87,L,Male,12
2017-12-06,1000,3862.26,S,Female,12


In [438]:
# Total sales amount per month.
df.groupby('month')[['amount']].sum()

Unnamed: 0_level_0,amount
month,Unnamed: 1_level_1
1,2441153.08
2,2517905.46
3,2437852.15
4,2552323.58
5,2482136.35
6,2491805.34
7,2542345.27
8,2478649.75
9,2516667.2
10,2538660.16


In [442]:
# IPython help lookup: the trailing `?` displays the documentation of
# DataFrame.pivot_table. This is notebook-only syntax, not plain Python.
df.pivot_table?

In [443]:
df.pivot_table(values='amount',index='size',columns='gender',aggfunc='sum')

gender,Female,Male
size,Unnamed: 1_level_1,Unnamed: 2_level_1
2XL,2058517.6,2075699.0
3XL,2099091.53,2076458.05
L,2097829.45,2175832.61
M,2145611.8,2090039.2
S,2361348.23,2125754.28
XL,2191027.89,2169165.42
XS,2104176.12,2242536.18


In [446]:
# Sales by size and gender for July 2017 only. Rows are selected through
# .loc with a partial date string; selecting rows with `df['7-2017']` was
# removed in pandas 2.0, where `[]` with a string looks up a column.
df.loc['2017-07'].pivot_table(values='amount',index='size',columns='gender',aggfunc='sum')

gender,Female,Male
size,Unnamed: 1_level_1,Unnamed: 2_level_1
2XL,167154.36,203430.69
3XL,208435.31,192819.22
L,146533.5,188757.23
M,159440.5,179399.97
S,198537.25,165278.9
XL,189443.76,155746.17
XS,170243.87,217124.54


In [447]:
df.to_csv('sales2017.csv')