In [1]:
import pandas as pd
from dateutil import parser
import datetime

In [4]:
# Root folder for the input and output files used below. Keep the trailing
# slash: file names are appended to it with plain string concatenation.
datalib = "/home/tbrowne/Mastercard/"

In [5]:
# Column labels applied to the extract when it is read
cols = [
    "Country",
    "Program",
    "Customer",
    "Driver",
    "Month",
    "Amount",
]
df = pd.read_csv(
    filepath_or_buffer=datalib + "mea-processed_1Aug2018.csv",
    names=cols,
)

In [6]:
# Normalise the source's date format: run every Month value through
# dateutil's parser, element by element.
df["Month"] = df["Month"].map(parser.parse)

In [7]:
# Baseline figures for the unfiltered data; fullVolume is the denominator
# for the share-of-volume calculation further down.
fullVolume = df["Amount"].sum()
monthMin, monthMax = df["Month"].min(), df["Month"].max()
print("Full dataset runs from {} - {}".format(monthMin, monthMax))

Full dataset runs from 2015-01-01 00:00:00 - 2018-05-01 00:00:00


###### Select only the top 12 countries

In [8]:
# Countries retained for the rest of the analysis
countries = [
    'United Arab Emirates', 'South Africa', 'Saudi Arabia', 'Nigeria',
    'Egypt', 'Qatar', 'Lebanon', 'Kuwait', 'Pakistan', 'Jordan',
    'Libya', 'Tunisia',
]

# Keep only rows for the selected countries and report how many were dropped
before = len(df)
df = df[df["Country"].isin(countries)]
after = len(df)
diff = before - after
print("{:,.0f} rows removed due to Country filter, leaving {:,.0f}".format(diff, after))

# Date span of what is left
monthMin, monthMax = df["Month"].min(), df["Month"].max()
print("Twelve countries runs from {} - {}".format(monthMin, monthMax))

# Share of the full dataset's Amount covered by the retained countries
twelveVolume = df["Amount"].sum()
pct = twelveVolume / fullVolume
print("Twelve countries account for {:.1%} of the Volume".format(pct))

295,318 rows removed due to Country filter, leaving 308,637
Twelve countries runs from 2015-01-01 00:00:00 - 2018-05-01 00:00:00
Twelve countries account for 89.0% of the Volume


###### Remove "Losses" (no recent data for a Country/Program/Customer/Driver combination) and "Wins" (recent data, but too little history to forecast)

In [9]:
# Log file that report() writes to; it stays open across cells and is
# closed after removeWinsandLosses has run.
output = open(datalib + "Wins and Losses.txt", mode="w")

In [10]:
def report(idx, indicator):
    """Write one pipe-delimited record to the module-level `output` file.

    Parameters
    ----------
    idx : iterable
        Group-key values identifying the series. Each value is converted
        with str() so non-string keys (e.g. numeric codes) do not make the
        join fail.
    indicator : str
        Flag appended as the last field (the caller passes "L" or "W").

    The record is the key values and the indicator joined by "|",
    terminated by a newline.
    """
    fields = [str(part) for part in idx]
    fields.append(indicator)
    output.write("|".join(fields) + "\n")

In [11]:
# "Win" is when you have a recent customer so not enough data to forecast
# "Loss" is when you don't have recent data, so no longer active/relevant
def removeWinsandLosses(df, cutoff, minHistory=12):
    """Remove "Loss" and "Win" series from df, logging each via report().

    Rows are grouped by (Country, Program, Customer, Driver) after sorting
    by those keys and Month. For every group:

    * "Loss": the last Month in the group is earlier than `cutoff`.
    * "Win":  not a Loss, but the group has fewer than `minHistory` rows.

    Each such group is passed to report() with "L" or "W" and all of its
    rows are removed. A summary count is printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Country, Program, Customer, Driver, Month.
    cutoff : str or datetime.datetime
        Either a "YYYY-MM-DD" string or an already-parsed datetime.
    minHistory : int, default 12
        Minimum number of rows a still-active group needs in order to be
        kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted, with the four key columns restored as
        ordinary columns and a fresh integer index.
    """
    keys = ["Country", "Program", "Customer", "Driver"]
    df = df.sort_values(keys + ["Month"]).set_index(keys, drop=True)
    if isinstance(cutoff, str):
        cutoff = datetime.datetime.strptime(cutoff, "%Y-%m-%d")

    loss = 0
    win = 0
    dropped = []  # group keys to remove; applied in one pass after the loop

    for idx, dates in df["Month"].groupby(level=[0, 1, 2, 3]):
        if dates.iloc[-1] < cutoff:       # Loss: no recent data
            report(idx, "L")
            loss += 1
        elif len(dates) < minHistory:     # Win: recent, but too short
            report(idx, "W")
            win += 1
        else:
            continue
        dropped.append(idx)

    # Filter once instead of calling df.drop() per group, which copied the
    # whole frame on every removal.
    if dropped:
        df = df.loc[~df.index.isin(dropped)]

    print("{:,.0f} Loss and {:,.0f} wins".format(loss, win))
    return df.reset_index()

In [12]:
# Discard the non-contiguous row labels left by the Country filter. A plain
# reset_index() would keep them as a stray "index" column, which would then
# be written to the exported CSV.
df = df.reset_index(drop=True)
try:
    df = removeWinsandLosses(df, "2018-01-30")
finally:
    # Close the Wins/Losses log even if the filtering step raises, so the
    # file handle is not left open.
    output.close()

1,396 Loss and 1,052 wins


In [13]:
# Persist the filtered data as a comma-separated file without the row index
df.to_csv(path_or_buf=datalib + "Processed.csv", sep=",", index=False)