In [1]:
import pandas as pd
import numpy as np

In [7]:
portfolio = pd.read_json('portfolio.json', orient='records', lines=True)
profile = pd.read_json('profile.json', orient='records', lines=True)
transcript = pd.read_json('transcript.json', orient='records', lines=True)

In [6]:
portfolio.head()

Unnamed: 0,channels,difficulty,duration,id,offer_type,reward
0,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10
1,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10
2,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,informational,0
3,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5
4,"[web, email]",20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,5


In [9]:
profile.head()

Unnamed: 0,age,became_member_on,gender,id,income
0,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,
1,55,20170715,F,0610b486422d4921ae7d2bf64640c50b,112000.0
2,118,20180712,,38fe809add3b4fcf9315a9694bb96ff5,
3,75,20170509,F,78afa995795e4d85b5d9ceeca43f5fef,100000.0
4,118,20170804,,a03223e636434f42ac4c3df47e8bac43,


In [10]:
transcript.head()

Unnamed: 0,event,person,time,value
0,offer received,78afa995795e4d85b5d9ceeca43f5fef,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}
1,offer received,a03223e636434f42ac4c3df47e8bac43,0,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
2,offer received,e2127556f4f64592b11af22de27a7932,0,{'offer id': '2906b810c7d4411798c6938adc9daaa5'}
3,offer received,8ec6ce2a7e7949b1bf142def7d0e0586,0,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'}
4,offer received,68617ca6246f4fbc85e91a2a49552598,0,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'}


In [12]:
print('portfolio:', portfolio.shape)
print('profile:', profile.shape)
print('transcript:', transcript.shape)

portfolio: (10, 6)
profile: (17000, 5)
transcript: (306534, 4)


# we will do the data processing first

# Portfolio
The channels column has different modes in a list, hence we will make a column for each mode and add values 1 or 0 to them 
offer_type column needs to be one hot encoded
we will add one column names 'offer_code' which will have the value = index_value+1

In [32]:

portfolio_new = pd.get_dummies(portfolio, columns=['offer_type'])
portfolio_new.columns

Index(['channels', 'difficulty', 'duration', 'id', 'reward', 'offer_type_bogo',
       'offer_type_discount', 'offer_type_informational'],
      dtype='object')

In [33]:
portfolio_new['email'] = portfolio_new.channels.apply(lambda x: 1 if 'email' in x else 0)

In [34]:
portfolio_new['web'] = portfolio_new.channels.apply(lambda x: 1 if 'web' in x else 0)

In [35]:
portfolio_new['social'] = portfolio_new.channels.apply(lambda x: 1 if 'social' in x else 0)

In [36]:
portfolio_new['mobile'] = portfolio_new.channels.apply(lambda x: 1 if 'mobile' in x else 0)

In [37]:
portfolio_new = portfolio_new.drop('channels', axis=1)

In [38]:
portfolio_new.head()

Unnamed: 0,difficulty,duration,id,reward,offer_type_bogo,offer_type_discount,offer_type_informational,email,web,social,mobile
0,10,7,ae264e3637204a6fb9bb56bc8210ddfd,10,1,0,0,1,0,1,1
1,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,10,1,0,0,1,1,1,1
2,0,4,3f207df678b143eea3cee63160fa8bed,0,0,0,1,1,1,0,1
3,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,1,0,0,1,1,0,1
4,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,0,1,0,1,1,0,0


In [39]:
portfolio_new['offer_code'] = portfolio_new.index.values+1

In [40]:
portfolio_new.head()

Unnamed: 0,difficulty,duration,id,reward,offer_type_bogo,offer_type_discount,offer_type_informational,email,web,social,mobile,offer_code
0,10,7,ae264e3637204a6fb9bb56bc8210ddfd,10,1,0,0,1,0,1,1,1
1,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,10,1,0,0,1,1,1,1,2
2,0,4,3f207df678b143eea3cee63160fa8bed,0,0,0,1,1,1,0,1,3
3,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,1,0,0,1,1,0,1,4
4,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,0,1,0,1,1,0,0,5


In [54]:
cols = portfolio_new.columns.tolist()
cols = cols[-1:]+cols[:-1]
cols

['offer_code',
 'difficulty',
 'duration',
 'id',
 'reward',
 'offer_type_bogo',
 'offer_type_discount',
 'offer_type_informational',
 'email',
 'web',
 'social',
 'mobile']

In [55]:
portfolio_new = portfolio_new[cols]
portfolio_new.head()

Unnamed: 0,offer_code,difficulty,duration,id,reward,offer_type_bogo,offer_type_discount,offer_type_informational,email,web,social,mobile
0,1,10,7,ae264e3637204a6fb9bb56bc8210ddfd,10,1,0,0,1,0,1,1
1,2,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,10,1,0,0,1,1,1,1
2,3,0,4,3f207df678b143eea3cee63160fa8bed,0,0,0,1,1,1,0,1
3,4,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,1,0,0,1,1,0,1
4,5,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,0,1,0,1,1,0,0


# Profile

In [64]:
print(profile.head())
print(profile.shape)

   age  became_member_on gender                                id    income
0  118          20170212   None  68be06ca386d4c31939f3a4f0e3dd783       NaN
1   55          20170715      F  0610b486422d4921ae7d2bf64640c50b  112000.0
2  118          20180712   None  38fe809add3b4fcf9315a9694bb96ff5       NaN
3   75          20170509      F  78afa995795e4d85b5d9ceeca43f5fef  100000.0
4  118          20170804   None  a03223e636434f42ac4c3df47e8bac43       NaN
(17000, 5)


In [62]:
profile.isna().sum()

age                    0
became_member_on       0
gender              2175
id                     0
income              2175
dtype: int64

In [63]:
profile['gender'].value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [67]:
profile.dropna(inplace=True)

In [68]:
profile.isna().sum()

age                 0
became_member_on    0
gender              0
id                  0
income              0
dtype: int64

In [70]:
profile = pd.get_dummies(profile, columns=['gender'])

In [None]:
profile['became_member_on'] = pd.to_datetime(profile.became_member_on, format='%Y%m%d')

In [71]:
profile_new = profile.copy()
profile_new.head()

Unnamed: 0,age,became_member_on,id,income,gender_F,gender_M,gender_O
1,55,2017-07-15,0610b486422d4921ae7d2bf64640c50b,112000.0,1,0,0
3,75,2017-05-09,78afa995795e4d85b5d9ceeca43f5fef,100000.0,1,0,0
5,68,2018-04-26,e2127556f4f64592b11af22de27a7932,70000.0,0,1,0
8,65,2018-02-09,389bc3fa690240e798340f5a15918d5c,53000.0,0,1,0
12,58,2017-11-11,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,0,1,0


In [74]:
profile_new[['age']].describe()

Unnamed: 0,age
count,14825.0
mean,54.393524
std,17.383705
min,18.0
25%,42.0
50%,55.0
75%,66.0
max,101.0


In [75]:
profile_new['became_member_on_year'] = profile_new.became_member_on.dt.year
profile_new['became_member_on_month'] = profile_new.became_member_on.dt.month
profile_new['became_member_on_date'] = profile_new.became_member_on.dt.day
profile_new.drop(columns=['became_member_on'], inplace=True)

In [76]:
profile_new.head()

Unnamed: 0,age,id,income,gender_F,gender_M,gender_O,became_member_on_year,became_member_on_month,became_member_on_date
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,1,0,0,2017,7,15
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,1,0,0,2017,5,9
5,68,e2127556f4f64592b11af22de27a7932,70000.0,0,1,0,2018,4,26
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,0,1,0,2018,2,9
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,0,1,0,2017,11,11


# Transcript

In [103]:
transcript = pd.read_json('transcript.json', orient='records', lines=True)
transcript.head()

Unnamed: 0,event,person,time,value
0,offer received,78afa995795e4d85b5d9ceeca43f5fef,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}
1,offer received,a03223e636434f42ac4c3df47e8bac43,0,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
2,offer received,e2127556f4f64592b11af22de27a7932,0,{'offer id': '2906b810c7d4411798c6938adc9daaa5'}
3,offer received,8ec6ce2a7e7949b1bf142def7d0e0586,0,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'}
4,offer received,68617ca6246f4fbc85e91a2a49552598,0,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'}


In [104]:
transcript['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [105]:
transcript_new = pd.get_dummies(transcript, columns = ['event'])

In [106]:
transcript_new.columns

Index(['person', 'time', 'value', 'event_offer completed',
       'event_offer received', 'event_offer viewed', 'event_transaction'],
      dtype='object')

# value column has dictionaries in it. for example offer_id, reward and amount

In [108]:
def get_offer_id(data):
    try:
        return data['offer id']
    except KeyError:
        try:
            return data['offer_id']
        except:
            return ''

In [109]:
transcript_new.head()

Unnamed: 0,person,time,value,event_offer completed,event_offer received,event_offer viewed,event_transaction
0,78afa995795e4d85b5d9ceeca43f5fef,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,1,0,0
1,a03223e636434f42ac4c3df47e8bac43,0,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,1,0,0
2,e2127556f4f64592b11af22de27a7932,0,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0,1,0,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,0,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,1,0,0
4,68617ca6246f4fbc85e91a2a49552598,0,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0,1,0,0


In [110]:
transcript_new['offer_id'] = transcript_new.value.apply(get_offer_id)

In [112]:
def get_reward(data):
    try:
        return data['reward']
    except KeyError:
        return 0


def get_amount(data):
    try:
        return data['amount']
    except KeyError:
        return 0

In [113]:
transcript_new['reward'] = transcript_new.value.apply(get_reward)
transcript_new['amount'] = transcript_new.value.apply(get_amount)

In [114]:
transcript_new.head()

Unnamed: 0,person,time,value,event_offer completed,event_offer received,event_offer viewed,event_transaction,offer_id,reward,amount
0,78afa995795e4d85b5d9ceeca43f5fef,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,1,0,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0.0
1,a03223e636434f42ac4c3df47e8bac43,0,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,1,0,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,0.0
2,e2127556f4f64592b11af22de27a7932,0,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0,1,0,0,2906b810c7d4411798c6938adc9daaa5,0,0.0
3,8ec6ce2a7e7949b1bf142def7d0e0586,0,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,1,0,0,fafdcd668e3743c1bb461111dcafc2a4,0,0.0
4,68617ca6246f4fbc85e91a2a49552598,0,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0,1,0,0,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0.0


In [118]:
transcript_new = transcript_new[['person', 'time', 'offer_id', 'event_offer received', 'event_offer viewed', 'event_offer completed',
         'event_transaction', 'reward', 'amount']]

In [119]:
transcript_new[transcript_new.person == 'a03223e636434f42ac4c3df47e8bac43']

Unnamed: 0,person,time,offer_id,event_offer received,event_offer viewed,event_offer completed,event_transaction,reward,amount
1,a03223e636434f42ac4c3df47e8bac43,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
15562,a03223e636434f42ac4c3df47e8bac43,6,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,1,0,0,0,0.0
90553,a03223e636434f42ac4c3df47e8bac43,234,,0,0,0,1,0,1.09
97700,a03223e636434f42ac4c3df47e8bac43,264,,0,0,0,1,0,3.5
110829,a03223e636434f42ac4c3df47e8bac43,336,3f207df678b143eea3cee63160fa8bed,1,0,0,0,0,0.0
123539,a03223e636434f42ac4c3df47e8bac43,336,3f207df678b143eea3cee63160fa8bed,0,1,0,0,0,0.0
150599,a03223e636434f42ac4c3df47e8bac43,408,5a8bc65990b245e5a138643cd4eb9837,1,0,0,0,0,0.0
201573,a03223e636434f42ac4c3df47e8bac43,504,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
245125,a03223e636434f42ac4c3df47e8bac43,576,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
277104,a03223e636434f42ac4c3df47e8bac43,612,,0,0,0,1,0,0.06


In [124]:
'''When an individual has utilized an offer, there are two transactions records created, one for claiming the 
        reward another for making the purchase. We are going to consolidate these two transaction records into one.
    '''
transcript_new = transcript_new.groupby(['person', 'time'], as_index=False).agg('max')

In [125]:
transcript_new[transcript_new.person == 'a03223e636434f42ac4c3df47e8bac43']

Unnamed: 0,person,time,offer_id,event_offer received,event_offer viewed,event_offer completed,event_transaction,reward,amount
156856,a03223e636434f42ac4c3df47e8bac43,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
156857,a03223e636434f42ac4c3df47e8bac43,6,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,1,0,0,0,0.0
156858,a03223e636434f42ac4c3df47e8bac43,234,,0,0,0,1,0,1.09
156859,a03223e636434f42ac4c3df47e8bac43,264,,0,0,0,1,0,3.5
156860,a03223e636434f42ac4c3df47e8bac43,336,3f207df678b143eea3cee63160fa8bed,1,1,0,0,0,0.0
156861,a03223e636434f42ac4c3df47e8bac43,408,5a8bc65990b245e5a138643cd4eb9837,1,0,0,0,0,0.0
156862,a03223e636434f42ac4c3df47e8bac43,504,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
156863,a03223e636434f42ac4c3df47e8bac43,576,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,0,0,0.0
156864,a03223e636434f42ac4c3df47e8bac43,612,,0,0,0,1,0,0.06
156865,a03223e636434f42ac4c3df47e8bac43,624,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,1,0,0,0,0.0


In [126]:
def get_duration(offer_id):
    portfolio = pd.read_json('portfolio.json', orient='records', lines=True)
    if offer_id.strip() != '':
        return portfolio[portfolio.id == offer_id]['duration'].values[0]
    else:
        return 0

In [128]:
transcript_clean = transcript_new
transcript_clean.rename(columns={'event_offer completed': 'offer_completed',
                                     'event_offer received': 'offer_received',
                                     'event_offer viewed': 'offer_viewed',
                                     'event_transaction': 'transaction'},
                            inplace=True)
transcript_clean['duration'] = transcript_clean[transcript_clean.offer_received == 1].offer_id.apply(get_duration)

In [129]:
transcript_clean.duration.fillna(0, inplace=True)
transcript_clean['duration'] = transcript_clean.duration.apply(lambda x: x * 24)
transcript_clean['expiration'] = transcript_clean.time + transcript_clean.duration
transcript_clean.drop(columns='duration', inplace=True)

In [130]:
transcript_clean = transcript_clean[['person', 'time', 'expiration', 'offer_id', 'offer_received', 'offer_viewed',
                                         'offer_completed', 'transaction', 'reward', 'amount']]

transcript_clean['expiration'] = transcript_clean.expiration.astype(int)

In [131]:
'''
        From the above output, it looks like we have populated the transactions that are not offer received with the 
        transaction timestamp. We need to fill with correct offer expiration time if offer id exists.
'''
idx = transcript_clean[transcript_clean.offer_received == 0].index
transcript_clean['expiration'].iloc[idx] = None
transcript_clean.expiration = transcript_clean.expiration.fillna(value=transcript_clean.time)
transcript_clean['expiration'] = transcript_clean.expiration.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [132]:
idx = transcript_clean[(transcript_clean.offer_id != '')
                           & (transcript_clean.offer_received == 0)].index

transcript_clean['expiration'].iloc[idx] = None
transcript_clean.expiration = transcript_clean.expiration.fillna(method='ffill')
transcript_clean['expiration'] = transcript_clean.expiration.astype(int)

In [133]:
'''
        We will use time columns information to create new columns: offer_received_time, offer_viewed_time, 
        offer_completed_time
'''
transcript_clean['offer_received_time'] = transcript_clean[transcript_clean.offer_received == 1]['time']
transcript_clean['offer_viewed_time'] = transcript_clean[transcript_clean.offer_viewed == 1]['time']
transcript_clean['offer_completed_time'] = transcript_clean[transcript_clean.offer_completed == 1]['time']
transcript_clean.offer_received_time.fillna(0, inplace=True)
transcript_clean.offer_viewed_time.fillna(0, inplace=True)
transcript_clean.offer_completed_time.fillna(0, inplace=True)

In [134]:
'''
         A person can receive the same offer multiple times. To consolidate transaction records associated within 
         offer expiration time, we will create a new column "offerid_expiration" and use this column to group the 
         transactions.
'''
transcript_clean['offerid_expiration'] = ''
idx = transcript_clean[transcript_clean.offer_id != ''].index
transcript_clean['expiration'] = transcript_clean.expiration.astype(str)
transcript_clean['offerid_expiration'].iloc[idx] = transcript_clean['offer_id'].iloc[idx] + \
                                                       transcript_clean['expiration'].iloc[idx]

transcript_clean['expiration'] = transcript_clean.expiration.astype(int)

In [135]:
'''
        Consolidate transaction records associated within offer expiration time
    '''
transcript_time = transcript_clean.groupby(['person', 'offerid_expiration'], as_index=False)[['amount',
                                                                                                  'offer_id',
                                                                                                  'offer_received_time',
                                                                                                  'offer_viewed_time',
                                                                                                  'offer_completed_time']].max()

transcript_clean.drop(columns=['offer_received_time', 'offer_viewed_time', 'offer_completed_time'],
                          inplace=True)

In [136]:
transcript_clean = transcript_clean.merge(transcript_time,
                                              left_on=['person', 'offerid_expiration'],
                                              right_on=['person', 'offerid_expiration'],
                                              how='outer')

transcript_clean.fillna(0, inplace=True)
transcript_clean = transcript_clean.sort_values(by=['person', 'time'])
transcript_clean.drop(columns=['offerid_expiration', 'offer_id_y'], inplace=True)
transcript_clean.rename(columns={'offer_id_x': 'offer_id'}, inplace=True)

In [137]:
'''
        We still have different transaction records for viewing/ completing. We will remove rows these rows as have 
        already captured this information in offer received transaction.
    '''
idx = transcript_clean[(transcript_clean.offer_id != '') & (transcript_clean.offer_received == 0)].index
transcript_clean.drop(labels=idx, inplace=True)
transcript_clean.reset_index(inplace=True, drop=True)

In [138]:
'''
        When we consolidated the transactions, for purchases that were performed without coupon, "amount_y" column is 
        populated by maximum amount spent by the person. We need to correct this.
    '''
transcript_clean['amount'] = transcript_clean[transcript_clean.offer_id == '']['amount_x']
transcript_clean['amount'] = transcript_clean.amount.fillna(value=transcript_clean.amount_y)
transcript_clean.drop(columns=['amount_x', 'amount_y'], inplace=True)

In [139]:
'''
         For regular transactions, we still have the expiration column populated. We will fill the expiration with 0.
    '''
idx = transcript_clean[transcript_clean.offer_id == ''].index
transcript_clean['expiration'].iloc[idx] = 0


In [140]:
'''
        A user is deemed to be influenced by promotion only after the individual made a transaction after viewing the 
        advertisement. We will create a new column and populate if the promotion or not influence the individual. 
'''
idx = transcript_clean[(transcript_clean.offer_viewed_time > 0)
                           & (transcript_clean.offer_viewed_time > transcript_clean.offer_received_time)
                           & (transcript_clean.offer_completed_time > transcript_clean.offer_viewed_time)].index

transcript_clean['influenced'] = 0
transcript_clean['influenced'].iloc[idx] = 1

In [141]:
'''
        Create a new column to capture transaction time.
    '''
transcript_clean['offer_received_time'] = transcript_clean.offer_received_time.astype(int)
transcript_clean['offer_viewed_time'] = transcript_clean.offer_viewed_time.astype(int)
transcript_clean['offer_completed_time'] = transcript_clean.offer_completed_time.astype(int)
transcript_clean['transaction_time'] = 0

idx = transcript_clean[transcript_clean.transaction == 1].index
transcript_clean['transaction_time'].iloc[idx] = transcript_clean['time'].iloc[idx]

idx = transcript_clean[transcript_clean.transaction == 0].index
transcript_clean['transaction_time'].iloc[idx] = transcript_clean['offer_completed_time'].iloc[idx]


In [142]:
'''
        When the transactions are consolidated, we lost information about  offer_received, offer_viewed, 
        offer_completed columns. We need to populate with correct values.
    '''

transcript_clean['offer_received'] = 0

idx = transcript_clean[transcript_clean.offer_received_time > 0].index
transcript_clean['offer_received'].iloc[idx] = 1


In [143]:
transcript_clean['offer_viewed'] = 0

idx = transcript_clean[transcript_clean.offer_viewed_time > 0].index
transcript_clean['offer_viewed'].iloc[idx] = 1

transcript_clean['offer_completed'] = 0

idx = transcript_clean[transcript_clean.offer_completed_time > 0].index
transcript_clean['offer_completed'].iloc[idx] = 1

In [144]:
transcript_clean = transcript_clean[['person', 'offer_id', 'time', 'offer_received_time', 'offer_viewed_time',
                                         'offer_completed_time', 'transaction_time', 'expiration', 'offer_received',
                                         'offer_viewed', 'offer_completed', 'transaction', 'reward', 'amount',
                                         'influenced']]

'''
        We no longer need "time" and "expiration" information. We will drop these columns.
'''
transcript_clean.drop(columns=['time', 'expiration'], inplace=True)

# Write all the preprocessed into CSV files

In [145]:
transcript_clean.to_csv('transcript_clean.csv')

In [146]:
profile_new.to_csv('profile_new.csv')

In [147]:
portfolio_new.to_csv('portfolio_new.csv')