#### Factset data needs some prep and merging

In [9]:
from datetime import datetime

import pandas as pd
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import MonthBegin

from getConfig import getConfig

# Accumulates (column name, frequency code) tuples as each dataset is loaded;
# the cells below append the codes "M", "Q", "W" and "D".
colMapping = []

In [10]:
# Load the project configuration; the cells below use config["dataLoc"] as the
# directory prefix for every Factset input and output file.
config = getConfig()

In [11]:
# Keep track of the frequency of each column
def writeRec(colMapping, path="/home/tbrownex/data/Hackett/Valmont/colFrequency.csv"):
    """Write the column/frequency mapping as pipe-delimited text.

    Every entry of ``colMapping`` becomes one ``<column>|<frequency>`` line.
    The destination file is overwritten if it already exists.

    Args:
        colMapping: Iterable of sequences whose first two items are the
            column label and its frequency code.
        path: Destination file. Defaults to the location that used to be
            hard-coded, so existing ``writeRec(colMapping)`` calls behave
            as before.
    """
    with open(path, "w", encoding="utf-8") as f:
        # str.format (rather than "+") also copes with non-string column labels.
        f.writelines("{}|{}\n".format(rec[0], rec[1]) for rec in colMapping)

##### Monthly first

In [12]:
# Load the monthly series, parse the "yy-Mon" date strings into timestamps,
# then index by date in chronological order
m = pd.read_csv(config["dataLoc"]+"/Factset/MEImonthly.csv")
m["Date"] = pd.to_datetime(m["Date"], format='%y-%b')
m = m.set_index("Date").sort_index()

# Record every monthly column with frequency code "M"
colMapping.extend([(col, "M") for col in m])

##### Quarterly

In [13]:
# Load the quarterly series (tab-separated, "," as thousands separator), parse
# the "yy-Mon" date strings and sort by date
q = pd.read_csv(config["dataLoc"]+"/Factset/MEIquarterly.txt", sep="\t", thousands=r',')

q["Date"] = pd.to_datetime(q["Date"], format='%y-%b')

q.set_index("Date", inplace=True)
q.sort_index(inplace=True)

# Create a dummy last record, so we can get the two months after the end of the data
# For example, if Sept 2018 was the last row, create Dec 2018 so resampling will generate
# Oct and Nov. Then delete the Dec dummy row
lastQ = q.index.max()
nextQ = lastQ + relativedelta(months=3)

q.loc[nextQ] = None   # Create the dummy row

# Fill in the months between quarters; use last known value "ffill".
# 'MS' is the month-start frequency: the parsed dates already fall on the first
# of the month, so the resampled labels stay start-of-month. (The month-end
# alias 'M' would relabel rows to end-of-month and is deprecated in recent
# pandas releases.)
q = q.resample('MS').ffill()

q = q.iloc[:-1]         # Delete the dummy row

# Record every quarterly column with frequency code "Q"
colMapping.extend((col, "Q") for col in q)

##### Weekly

In [14]:
# Load the weekly series, parse the dates and sort by date
w = pd.read_csv(config["dataLoc"]+"/Factset/MEIweekly.csv")

w["Date"] = pd.to_datetime(w["Date"])

w.set_index("Date", inplace=True)
w.sort_index(inplace=True)

# Group the weeks into months to match the others: each month's value is the
# sum of the weekly rows dated within it. 'MS' labels each month by its first
# day, so no end-of-month -> start-of-month correction is needed. (The
# month-end alias 'M' is deprecated in recent pandas releases.)
w = w.resample('MS').sum()

# Record every weekly column with frequency code "W"
colMapping.extend((col, "W") for col in w)

##### Daily

In [15]:
# Load the daily series, parse the dates and sort by date
d = pd.read_csv(config["dataLoc"]+"/Factset/MEIdaily.csv")

d["Date"] = pd.to_datetime(d["Date"])

d.set_index("Date", inplace=True)
d.sort_index(inplace=True)

# Group the days into months to match the others: keep the last observation of
# each month. 'MS' labels each month by its first day, so no end-of-month ->
# start-of-month correction is needed. (The month-end alias 'M' is deprecated
# in recent pandas releases.)
d = d.resample('MS').last()

# Move every value down one row, so each month carries the previous month's
# last daily value; the first month becomes NaN.
# NOTE(review): presumably this avoids using end-of-month data that would not
# yet be known at the start of the month -- confirm.
d = d.shift(1)

# Record every daily column with frequency code "D"
colMapping.extend((col, "D") for col in d)

In [16]:
# Place the daily, monthly, weekly and quarterly frames side by side, keeping
# only the dates present in all four, then discard rows with any missing value
merged = pd.concat([d, m, w, q], axis=1, join="inner").dropna()

# Save the combined table, then the column -> frequency lookup
merged.to_csv(config["dataLoc"] + "/Factset/merged.csv")
writeRec(colMapping)