## 4.6 Merging and exporting data

### This script contains the following points:

#### 1. Create data to experiment on
#### 2. Concatenate dataframes
#### 3. Append data
#### 4. Merge data

In [15]:
# Import libraries

import pandas as pd
import numpy as np
import os

#### 1. Create data to experiment on

In [17]:
# Define a dictionary containing January 2020 data 

data1 = {'customer_id':['6732', '767', '890', '635'], 
        'month':['Jan-20', 'Jan-20', 'Jan-20', 'Jan-20'], 
        'purchased_meat':[0, 13, 3, 4], 
        'purchased_alcohol':[1, 2, 10, 0],
        'purchased_snacks': [10, 5, 1, 7]} 

In [18]:
# Define a dictionary containing February 2020 data 

data2 = {'customer_id':['6732', '767', '890', '635'], 
        'month':['Feb-20', 'Feb-20', 'Feb-20', 'Feb-20'], 
        'purchased_meat':[0, 10, 5, 3], 
        'purchased_alcohol':[2, 4, 14, 0],
        'purchased_snacks': [15, 3, 2, 6]} 

In [19]:
# Convert the dictionary into DataFrame  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data2,index=[0, 1, 2, 3])

In [20]:
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [21]:
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 2. Concatenate dataframes

In [23]:
# Create a list that contains our dataframes

frames = [df, df_1]

In [24]:
# Check the output

frames

[  customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Jan-20               0                  1                10
 1         767  Jan-20              13                  2                 5
 2         890  Jan-20               3                 10                 1
 3         635  Jan-20               4                  0                 7,
   customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Feb-20               0                  2                15
 1         767  Feb-20              10                  4                 3
 2         890  Feb-20               5                 14                 2
 3         635  Feb-20               3                  0                 6]

In [25]:
# Check the data types to be sure it is a list

type(frames)

list

In [26]:
# Concatenate the dataframes using default options

df_concat = pd.concat(frames)

In [27]:
# Check the output

df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [28]:
# Concatenate the dataframes using the axis = 1 --> create a wide format as an output

df_concat = pd.concat(frames, axis = 1)

In [29]:
# Check the output

df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,customer_id.1,month.1,purchased_meat.1,purchased_alcohol.1,purchased_snacks.1
0,6732,Jan-20,0,1,10,6732,Feb-20,0,2,15
1,767,Jan-20,13,2,5,767,Feb-20,10,4,3
2,890,Jan-20,3,10,1,890,Feb-20,5,14,2
3,635,Jan-20,4,0,7,635,Feb-20,3,0,6


#### 3. Append data

In [31]:
df_appended = df.append(df_1)

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
df_appended

In [42]:
# Create data with different columns from df

data3 = {'customer_id':['6732', '767', '890', '635'], 
        'month':['Jan-20', 'Jan-20', 'Jan-20', 'Jan-20'], 
        'days_purchased_on':[0, 13, 3, 4]} 

In [43]:
# Convert to dataframe

df_2 = pd.DataFrame(data3,index=[0, 1, 2, 3])

In [44]:
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,13
2,890,Jan-20,3
3,635,Jan-20,4


In [45]:
# Create a new dataset combining df and df_2

df_append_test = df.append(df_2)

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
df_append_test

#### 4. Merge data

In [55]:
# Merge df and df_2 using customer_id as a key 

df_merged = df.merge(df_2, on = ['customer_id'])

In [56]:
df_merged

Unnamed: 0,customer_id,month_x,purchased_meat,purchased_alcohol,purchased_snacks,month_y,days_purchased_on
0,6732,Jan-20,0,1,10,Jan-20,0
1,767,Jan-20,13,2,5,Jan-20,13
2,890,Jan-20,3,10,1,Jan-20,3
3,635,Jan-20,4,0,7,Jan-20,4


In [57]:
# Merge df and df_2 using customer_id and month as a keys 

df_merged = df.merge(df_2, on = ['customer_id', 'month'])

In [58]:
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,13
2,890,Jan-20,3,10,1,3
3,635,Jan-20,4,0,7,4


In [59]:
# Merge df and df_2 using customer_id and month as a keys, add a merge flag

df_merged = df.merge(df_2, on = ['customer_id', 'month'], indicator = True)

In [60]:
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both


In [61]:
df_merged['_merge'].value_counts()

_merge
both          4
left_only     0
right_only    0
Name: count, dtype: int64

In [62]:
# Test merge without overwriting

pd.merge(df,df_2, on = ['customer_id', 'month'], indicator = True)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both


In [84]:
# Adding How - "Inner" join type
df.merge(df_2, on = ['customer_id', 'month'], how = 'inner')

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,13
2,890,Jan-20,3,10,1,3
3,635,Jan-20,4,0,7,4


In [86]:
# Adding How - "left" join type
df.merge(df_2, on = ['customer_id', 'month'], how = 'left')

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,13
2,890,Jan-20,3,10,1,3
3,635,Jan-20,4,0,7,4


In [88]:
# Adding How - "right" join type
df.merge(df_2, on = ['customer_id', 'month'], how = 'right')

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,13
2,890,Jan-20,3,10,1,3
3,635,Jan-20,4,0,7,4


In [90]:
# Adding How - "outer" join type
df.merge(df_2, on = ['customer_id', 'month'], how = 'outer')

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,635,Jan-20,4,0,7,4
1,6732,Jan-20,0,1,10,0
2,767,Jan-20,13,2,5,13
3,890,Jan-20,3,10,1,3
