In [66]:
import os
from math import ceil

import pandas as pd
import sqlalchemy as sa

### data extract helper function

In [44]:
def get_engine(svr,db):
    """Return a SQLAlchemy engine for a SQL Server database.

    Assembles an ``mssql+pyodbc`` URL that names the
    "SQL Server Native Client 11.0" ODBC driver and requests a trusted
    connection, so no user name or password is embedded in the URL.

    Parameters
    ----------
    svr : str
        Server name, placed in the host part of the URL.
    db : str
        Database name, placed in the path part of the URL.

    Returns
    -------
    The object returned by ``sa.create_engine`` for the assembled URL.
    """
    # Only the first query parameter is introduced by '?'; every further
    # parameter must be joined with '&'. Using '?' twice would make
    # 'trusted_connection=yes' part of the driver value.
    constr = 'mssql+pyodbc://{}/{}?'.format(svr, db)
    constr += 'driver=SQL+Server+Native+Client+11.0'
    constr += '&trusted_connection=yes'

    return sa.create_engine(constr)

### get main data set

In [45]:
# build the engine for database ETLDB on server EUSETL
engine = get_engine(svr='EUSETL', db='ETLDB')

In [46]:
# read the whole ZenGAProject table into a DataFrame
df = pd.read_sql_table(table_name='ZenGAProject', con=engine)

In [47]:
# show each column of the loaded table together with its dtype
df.dtypes

DBKey                        int64
BUS_DAT             datetime64[ns]
DOC_ID                       int64
PMT_SEQ_NO                   int64
CustomerKey                  int64
VisitNumber                  int64
VisitCount                   int64
TicketDate          datetime64[ns]
TicketTime                  object
PriorVisits                  int64
SaleLines                    int64
ReturnLines                  int64
GiftCardLines                int64
NetAmount                  float64
NetRetailAmount            float64
NetQSRAmount               float64
DiscountAmount             float64
StationGroup                object
UniqueItems                  int64
UniqueCategories             int64
ReturnedBags                  bool
BoughtProduce                 bool
WillReturn                    bool
dtype: object

In [48]:
# Remove the columns that are not carried forward. Rebinding `df` instead of
# mutating in place, together with errors='ignore', keeps this cell safe to
# re-run after the columns have already been dropped (no KeyError).
df = df.drop(
    columns=['DBKey', 'BUS_DAT', 'DOC_ID', 'PMT_SEQ_NO', 'VisitCount'],
    errors='ignore',
)

In [49]:
# (rows, columns) of the frame after the column drop
df.shape

(1621854, 18)

In [18]:
# peek at 3 randomly chosen rows; no random_state is passed, so the rows differ between runs
df.sample(3)

Unnamed: 0,CustomerKey,VisitNumber,TicketDate,TicketTime,PriorVisits,SaleLines,ReturnLines,GiftCardLines,NetAmount,NetRetailAmount,NetQSRAmount,DiscountAmount,StationGroup,UniqueItems,UniqueCategories,ReturnedBags,BoughtProduce,WillReturn
1235163,351072,31,2015-11-17,19:32:23,30,4,0,0,17.11,17.11,0.0,0,Other,4,4,False,False,True
973568,221561,6,2015-07-03,13:27:51,5,28,0,0,91.7,91.7,0.0,0,Front End,28,6,False,True,True
753483,223659,1,2015-04-03,19:56:01,0,1,0,0,4.5,0.0,4.5,0,Other,1,1,False,False,False


In [96]:
# export data into smaller files for github
step = 50000  # rows per output file

# Create the target folder if it is missing so to_csv does not fail
# on a fresh checkout.
os.makedirs('data', exist_ok=True)

for r in range(ceil(len(df) / step)):
    r_low = r * step
    r_high = r_low + step

    # chunk number is zero-padded to three digits: maindata_000.csv, maindata_001.csv, ...
    filename = 'data/maindata_' + str(r).zfill(3) + '.csv'

    # iloc slicing stops at the end of the frame, so the last chunk may be shorter;
    # index=False leaves the row index out of the file.
    df.iloc[r_low:r_high].to_csv(filename, index=False)

### extract ticket counts by hour

In [39]:
# point the engine at database CPSQL on server EUSETL
engine = get_engine(svr='EUSETL', db='CPSQL')

In [40]:
# load the ticket-count query text from its .sql file
with open('sql/tkt_count.sql', mode='r') as fobj:
    sqltxt = fobj.read()

In [41]:
# run the query and keep the default integer index (no column promoted to index)
dftktct = pd.read_sql_query(sqltxt, engine)

In [42]:
# preview the first rows of the query result
dftktct.head()

Unnamed: 0,TicketDate,Hr,TicketCount
0,2014-01-30,10,48
1,2015-07-21,16,357
2,2015-04-26,10,131
3,2014-04-26,16,459
4,2015-03-02,21,104


In [93]:
# write the ticket counts to csv without the row index
dftktct.to_csv('data/tkt_count.csv', index=False)