In [1]:
#Loading the necessary packages
import numpy as np
import pandas as pd
import re
from datetime import datetime,timedelta,date
from functools import reduce
from dateutil.relativedelta import *
from warnings import filterwarnings
filterwarnings("ignore")
## Notebook-wide pandas display settings:
## show up to 40 columns and truncate cell text at 50 characters.
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = 50

In [2]:
def read_file(filepath,filetype,encoding='ISO-8859-1'):
    """
    Read a delimited text file (txt or csv) into a DataFrame.

    Parameters
    ----------
    filepath : str
        Location of the file; passed straight to ``pd.read_csv``.
    filetype : str
        Must be exactly "txt" or "csv".
    encoding : str, default 'ISO-8859-1'
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The full contents of the file.

    Raises
    ------
    ValueError
        If ``filetype`` is neither "txt" nor "csv".
    """
    # Membership test: the earlier `filetype=="txt" or "csv"` was always truthy
    # (a non-empty string), so unsupported types were never rejected.
    if filetype not in ("txt", "csv"):
        raise ValueError("Kindly Check! Only txt or csv filetype allowed")
    return pd.read_csv(filepath,encoding=encoding,low_memory=False)

In [3]:
def read_file_chunks(filepath,filetype,chunk_size,encoding='ISO-8859-1'):
    """
    Read a delimited text file (txt or csv) in chunks and return it as one DataFrame.

    Parameters
    ----------
    filepath : str
        Location of the file; passed straight to ``pd.read_csv``.
    filetype : str
        Must be exactly "txt" or "csv".
    chunk_size : int
        Number of rows read per chunk.
    encoding : str, default 'ISO-8859-1'
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order, keeping the row labels that
        ``pd.read_csv`` assigned. An empty DataFrame if no chunk was produced.

    Raises
    ------
    ValueError
        If ``filetype`` is neither "txt" nor "csv".
    """
    # Membership test: the earlier `filetype=="txt" or "csv"` was always truthy,
    # so unsupported types were never rejected.
    if filetype not in ("txt", "csv"):
        raise ValueError("Kindly Check! Only txt or csv filetype allowed")

    chunk_reader = pd.read_csv(filepath,encoding=encoding,low_memory=False,chunksize=chunk_size)
    # Drain the reader completely. The earlier fixed 999-iteration loop silently
    # dropped everything after the 999th chunk.
    chunks = list(chunk_reader)
    if not chunks:
        return pd.DataFrame()
    # One concat at the end instead of DataFrame.append per chunk: append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    return pd.concat(chunks)

In [163]:
#clothes_cats = ['women s ready to wear stores','women s accessory and specialty shops','miscellaneous apparel and accessory shops','family clothing stores','miscellaneous and specialty retail stores',\
#'men s and women s clothing stores','men s and boy s clothing and accessories stores','men s women s and children s uniforms and commercial clothing','clothing rental costumes formal wear uniforms']

In [4]:
# Columns kept from the raw transaction file; everything else is dropped after loading.
required_cols = [
    'ArtificialAccountKey',
    'POSTAMOUNT',
    'CARDACCEPTORCITY',
    'CARDACCEPTORCOUNTRY',
    'CARDACCEPTORNAME',
    'CARDACCEPTORSTATE',
    'CARDACCEPTORSTREET',
    'LOCALTRANDATE',
    'LOCALTRANTIME',
    'MerchantCategory',
    'MISCDATA3',
    'OURACCTCODE1',
    'OURCARDTYPE',
    'OURSYSTEMDATE',
    'OURTRANCODE',
    'RECURRINGFLAG',
]

In [5]:
# Rows read per chunk when the transaction file is loaded with read_file_chunks.
chunksize = 350000
## Transactions of customers for past one year
# Alternative input kept for reference (a .txt extract on another machine):
#filepath = r"C:\Users\sugan\Desktop\USF Grow Financial\MajorEvents\USFCardTransactionsLimitedMembers20181107.txt"
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this exact location exists; consider a configurable data directory.
filepath = r"C:\Users\ujjwa\Downloads\Grow Backup\Grow\data\Customers_10000\Customers_10000.csv"
# File type passed to the reader alongside filepath.
filetype = 'csv'

In [6]:
#Loading the data
# Read the transaction file chunk by chunk, then keep only the columns listed in required_cols.
customer_data = read_file_chunks(filepath,filetype,chunksize)
# .copy() makes the selection an independent frame, so the column assignments
# in later cells are not treated as writes to a slice of the raw data
# (the resulting warning was previously hidden by the global warning filter).
customer_data = customer_data[required_cols].copy()
##check if data is loaded successfully
if(len(customer_data)==0):
    raise Exception("Data not loaded Kindly check!format of the data file")

In [7]:
# Parse the transaction date once, then derive the calendar fields from it
# with the vectorised .dt accessor instead of a Python call per row.
customer_data['LOCALTRANDATE'] = pd.to_datetime(customer_data['LOCALTRANDATE'])
# Zero-padded two-digit month, e.g. '03'.
customer_data['MONTH'] = customer_data['LOCALTRANDATE'].dt.strftime('%m')
customer_data['YEAR'] = customer_data['LOCALTRANDATE'].dt.year
# Year-month key such as '2018_03'; used as the monthly grouping key later on.
customer_data['MO_YR'] = customer_data['YEAR'].astype('str')+'_'+customer_data['MONTH'].astype('str')
customer_data['DATE'] = customer_data['LOCALTRANDATE'].dt.strftime('%Y-%m-%d')
# Re-assign the sorted frame. The earlier form sorted a temporary copy and
# reset *its* index in place, which left customer_data itself unsorted.
customer_data = customer_data.sort_values(by=['ArtificialAccountKey','LOCALTRANDATE']).reset_index(drop=True)

In [8]:
##Create a new cleaned card acceptor names and merchant category
def clean_name(text,space=''):
    """
    Normalise a free-text name: lower-case it and remove every character that
    is not an ASCII letter or whitespace.

    Parameters
    ----------
    text : any
        Value to clean. It is converted with ``str()`` first, so missing values
        come through as the literal text "none" / "nan".
    space : str, default ''
        Pass "X" to keep interior whitespace and only strip the ends.
        Any other value removes every space character from the result.

    Returns
    -------
    str
        The cleaned name.
    """
    lowered = str(text).lower()
    # Raw string so that \s reaches the regex engine as written; in a normal
    # string literal it is an invalid escape sequence.
    letters_only = re.sub(r'[^a-zA-Z\s]',"",lowered)
    if(space == "X"):
        return letters_only.strip()
    return letters_only.replace(" ","")

In [9]:
# Cleaned merchant name and merchant category; space='X' keeps the spaces
# between words and only trims the ends.
customer_data['clean_cardacceptor_name'] = customer_data['CARDACCEPTORNAME'].apply(clean_name, space='X')
customer_data['clean_merchant'] = customer_data['MerchantCategory'].apply(clean_name, space='X')

In [None]:
## Grouped the data into three categories based on keywords, merchant type and posted amount and assign a probability score for each.
'''
1.Group 1: Customers who have made large transaction on relevant categories (hotels, jewelry, catering, florals) in a month.
2.Group 2: Customers who have spent on bridal stores >300$
3.Group 3: Customers who have spent on photography > 500$
4.Mark probability as 'low' for Groups 2 & 3 and leave it empty (no score) for Group 1.
'''

### Large Transactions (Hotels, Jewelry, Catering, Florists)

In [10]:
## Keyword workbook: the 'bride' sheet holds the bridal-store keywords and the
## 'large' sheet holds (key, search_term) pairs for the large-spend categories.
keywords = pd.ExcelFile(r"C:\Users\ujjwa\Downloads\Grow Backup\Grow\data\Wedding_Keywords.xlsx")
bridal_stores = keywords.parse(sheet_name='bride', names=['keyword'])
large_merchant = keywords.parse(sheet_name='large', names=['key','search_term'])

## Lookup keyed by search term, giving the category name it stands for.
## When a search term is repeated, the last row wins.
dict_large = dict(zip(large_merchant['search_term'], large_merchant['key']))

In [11]:
## Tagging records for large transactions.
## For every transaction, collect the category of each search term whose
## pattern is found in the cleaned merchant category (possibly several, possibly none).
final_class = [
    [category for pattern, category in dict_large.items() if re.search(pattern, merchant) is not None]
    for merchant in map(str, customer_data['clean_merchant'])
]

In [12]:
##Flattening the predictions list:
##keep the first matched category per transaction, or "" when nothing matched
flat_list = [matches[0] if len(matches) != 0 else "" for matches in final_class]

In [13]:
##  Group 1: tagging the data with identified large category

# Assign the tags positionally (flat_list holds one tag per row, in row order).
# Wrapping them in pd.Series would align on index labels instead and misplace
# or blank out tags whenever customer_data's index is not exactly 0..n-1.
customer_data['Identified_large_txn'] = flat_list
customer_large_txn = customer_data.loc[customer_data.Identified_large_txn != '']

##creating groups: total spend per customer, category and month.
# sum() + rename replaces .agg({'spent':'sum'}); renaming through a dict on a
# single grouped column is rejected by pandas >= 1.0.
cust_large_grp = (
    customer_large_txn
    .groupby(by=['ArtificialAccountKey','Identified_large_txn','MO_YR'],as_index=False)['POSTAMOUNT']
    .sum()
    .rename(columns={'POSTAMOUNT':'spent'})
)

##min monthly spend a category must exceed to count as a large transaction
large_txn_min_spend = {'hotels':2000, 'jewelry':2000, 'florists':500, 'caterers':500}

def _large_spenders(grouped, category, min_spend):
    """Return the (customer, category) rows of `grouped` for `category` whose monthly spend exceeds `min_spend`."""
    qualifies = (grouped.Identified_large_txn == category) & (grouped.spent > min_spend)
    return grouped.loc[qualifies, ['ArtificialAccountKey', 'Identified_large_txn']]

##splitting groups for hotels, jewelry, florists and caterers
cust_hotel_df = _large_spenders(cust_large_grp, 'hotels', large_txn_min_spend['hotels'])
cust_jewelry_df = _large_spenders(cust_large_grp, 'jewelry', large_txn_min_spend['jewelry'])
cust_florists_df = _large_spenders(cust_large_grp, 'florists', large_txn_min_spend['florists'])
cust_caterer_df = _large_spenders(cust_large_grp, 'caterers', large_txn_min_spend['caterers'])

#concatenating all the large transactions
cust_large_df = pd.concat([cust_hotel_df,cust_jewelry_df, cust_florists_df, cust_caterer_df])
cust_large_df = cust_large_df.reset_index(drop=True)
# Group 1 carries no score of its own; it only raises the score when combined with another group.
cust_large_df['prob'] = ''
cust_large_df['category'] = 'large'
# The category name takes the place of a merchant name so all three groups share one schema.
cust_large_df = cust_large_df.rename(columns = {'Identified_large_txn':'clean_cardacceptor_name'})

### Bridal Stores

In [14]:
## Group 2: Bridal Stores
# Merchant categories excluded from the bridal match even when the merchant name contains a keyword.
other_cat = ['theatrical producers except motion pictures ticket agencies',
       'insurance sales underwriting and premiums',
        'commercial photography art and graphics',
       'card shops gift novelty and souvenir shops']

# Build one boolean mask across all keywords and select the rows once.
# Appending the matches keyword by keyword added a transaction once per
# matching keyword, so a name hitting two keywords was double-counted in the
# monthly total below. (Keywords are still treated as regular expressions.)
bridal_keywords = bridal_stores.iloc[:, 0].dropna().astype(str)
is_bridal_name = pd.Series(False, index=customer_data.index)
for keyword in bridal_keywords:
    is_bridal_name = is_bridal_name | customer_data.clean_cardacceptor_name.str.contains(keyword, na=False)

cust_bride_df = customer_data.loc[is_bridal_name & (~customer_data.clean_merchant.isin(other_cat))].reset_index(drop=True)

# Monthly spend per customer and store. sum() + rename replaces
# .agg({'spent':'sum'}), which pandas >= 1.0 rejects on a single grouped column.
cust_bride_grp = (
    cust_bride_df
    .groupby(by=['ArtificialAccountKey','clean_cardacceptor_name','MO_YR'],as_index=False)['POSTAMOUNT']
    .sum()
    .rename(columns={'POSTAMOUNT':'spent'})
)

##customers who spent more than 300 in bridal stores.
cust_bride_final = cust_bride_grp.loc[(cust_bride_grp.spent>300), ['ArtificialAccountKey', 'clean_cardacceptor_name']].reset_index(drop=True)
cust_bride_final['prob'] = 'low'
cust_bride_final['category'] = 'bridal'

### Photography

In [15]:
## Group 3: Photography
photo_cat = ['commercial photography art and graphics']
# Chained reset_index returns a fresh frame rather than mutating a .loc selection in place.
customer_photo = customer_data.loc[customer_data.clean_merchant.isin(photo_cat)].reset_index(drop=True)

# Monthly spend per customer and merchant. sum() + rename replaces
# .agg({'spent':'sum'}), which pandas >= 1.0 rejects on a single grouped column.
cust_photo_grp = (
    customer_photo
    .groupby(by=['ArtificialAccountKey','MO_YR', 'clean_cardacceptor_name'],as_index=False)['POSTAMOUNT']
    .sum()
    .rename(columns={'POSTAMOUNT':'spent'})
)

##customers who spent more than 500 in photoshoots.
cust_photo_df = cust_photo_grp.loc[(cust_photo_grp.spent>500), ['ArtificialAccountKey', 'clean_cardacceptor_name']].reset_index(drop=True)
cust_photo_df['prob'] = 'low'
cust_photo_df['category'] = 'photography'

## Concatenation and Scoring

In [16]:
## Stack the three group results (photography, bridal, large) into one frame.
final_df = pd.concat([cust_photo_df, cust_bride_final, cust_large_df]).reset_index(drop=True)

# Per customer, three lists without repeats, each in first-seen order:
#   [matched names, matched categories, assigned probabilities]
cust_dict = {}
for record in final_df.itertuples(index=False):
    customer_entry = cust_dict.setdefault(record.ArtificialAccountKey, [[], [], []])
    observed = (record.clean_cardacceptor_name, record.category, record.prob)
    for seen_values, value in zip(customer_entry, observed):
        if value not in seen_values:
            seen_values.append(value)

In [17]:
## Changing the assigned probability with medium, high, very_high based on the following rules:
##   customer is present in Group 2 and 3 (bridal + photography)        -> high
##   customer is present in Group 1 and either of Group 2 or 3          -> medium
##   customer is present in all the Groups 1, 2 and 3                   -> very_high
##   no matches                                                         -> probability remains unchanged
# Keyed by the *set* of categories, so the order in which a customer's
# categories were collected does not matter.
combined_scores = {
    frozenset(['photography', 'bridal']): 'high',
    frozenset(['bridal', 'large']): 'medium',
    frozenset(['photography', 'large']): 'medium',
    frozenset(['photography', 'bridal', 'large']): 'very_high',
}
for entry in cust_dict.values():
    rescored = combined_scores.get(frozenset(entry[1]))
    if rescored is not None:
        entry[2] = [rescored]

## converting the dictionary into dataframe
# Collect plain records and build the frame once: concatenating inside the loop
# re-copied the growing frame on every customer, labelled every row 0, and left
# a column-less frame (so `.probability` failed) when no customer qualified.
output_columns = ['customer_key','card_acceptor_name','category','probability']
output_records = [
    (customer_key, ','.join(names), ','.join(categories), prob)
    for customer_key, (names, categories, probs) in cust_dict.items()
    for prob in probs
]
output_df = pd.DataFrame(output_records, columns=output_columns)

## Eliminating other unscored customers
final_scored_df = output_df.loc[output_df.probability != ""].reset_index(drop=True)

## Output

In [None]:
## output - list of customers scored with probability of marriage.
## One row per scored customer: customer_key, card_acceptor_name (comma-joined
## matches), category (comma-joined groups) and probability; customers whose
## probability is empty have already been filtered out.
final_scored_df   