# Combining and Merging Datasets

### pandas.merge connects rows in DataFrames based on one or more keys. 

### This will be familiar to users of SQL or other relational databases, as it implements database join operations.

pandas.concat concatenates or “stacks” together objects along an axis

<img src="Pandas_Images\merge.JPG">

<img src="Pandas_Images\syntax_merge.JPG">

# Database-Style DataFrame Joins
Merge or join operations combine datasets by linking rows using one or more keys.
These operations are central to relational databases (e.g., SQL-based). The merge
function in pandas is the main entry point for using these algorithms on your data.

In [2]:
import pandas as pd
# Left table: the join key repeats ('a' and 'b' each occur three times).
left_keys = ['b', 'b', 'a', 'c', 'a', 'a', 'b']
df1 = pd.DataFrame({'key': left_keys, 'data1': range(len(left_keys))})

# Right table: every key occurs exactly once.
right_keys = ['a', 'b', 'd']
df2 = pd.DataFrame({'key': right_keys, 'data2': range(len(right_keys))})


df1

In [5]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [3]:
# Many-to-one join: df1 repeats the 'a' and 'b' keys, df2 lists each key once.
# No `on=` is given, so merge joins on the overlapping column name ('key').
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


Note that I didn’t specify which column to join on. If that information is not
specified, merge uses the overlapping column names as the keys. It’s a good practice to
specify explicitly, though.

In [6]:
pd.merge(df1, df2, on='key')


Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


If the column names are different in each object, you can specify them separately:

In [7]:
# The key column has a different name on each side ('lkey' vs 'rkey'),
# so the join column is named per side with left_on / right_on.
df3 = pd.DataFrame(dict(lkey=list('bbacaab'), data1=range(7)))
df4 = pd.DataFrame(dict(rkey=list('abd'), data2=range(3)))
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [None]:
""" 'inner' Use only the key combinations observed in both tables
'left' Use all key combinations found in the left table
'right' Use all key combinations found in the right table
'outer' Use all key combinations observed in both tables together """

<img src="Pandas_Images\join.JPG">

In [None]:
# Many-to-many merge example: the keys 'a' and 'b' repeat in BOTH frames,
# so the behavior is well-defined but not necessarily intuitive.
df1 = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df2 = pd.DataFrame({
    'key': list('ababd'),
    'data2': range(5),
})


In [9]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [14]:
# Both frames carry the columns 'key1' and 'key2'; rows are listed record by
# record as (key1, key2, value).
left = pd.DataFrame(
    [('foo', 'one', 1), ('foo', 'two', 2), ('bar', 'one', 3)],
    columns=['key1', 'key2', 'lval'],
)
right = pd.DataFrame(
    [('foo', 'one', 4), ('foo', 'one', 5), ('bar', 'one', 6), ('bar', 'two', 7)],
    columns=['key1', 'key2', 'rval'],
)

# Alternative: join on both key columns with an outer join instead:
#   pd.merge(left, right, on=['key1', 'key2'], how='outer')
# Joining on 'key1' only leaves 'key2' overlapping, so it receives the suffixes.
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


"""
left DataFrame to be merged on the left side.

right DataFrame to be merged on the right side.

how One of 'inner', 'outer', 'left', or 'right'; defaults to 'inner'.

on Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys
given, will use the intersection of the column names in left and right as the join keys.
left_on Columns in left DataFrame to use as join keys.
right_on Analogous to left_on for right DataFrame.
left_index Use row index in left as its join key (or keys, if a MultiIndex).
right_index Analogous to left_index.
sort Sort merged data lexicographically by join keys; False by default (leaving it disabled gives better performance in
some cases on large datasets).
suffixes Tuple of string values to append to column names in case of overlap; defaults to ('_x', '_y') (e.g., if
'data' in both DataFrame objects, would appear as 'data_x' and 'data_y' in result).
copy If False, avoid copying data into resulting data structure in some exceptional cases; by default always
copies.
indicator Adds a special column _merge that indicates the source of each row; values will be 'left_only',
'right_only', or 'both' based on the origin of the joined data in each row.
"""

# Merging on Index

In some cases, the merge key(s) in a DataFrame will be found in its index. In this
case, you can pass left_index=True or right_index=True (or both) to indicate that
the index should be used as the merge key:

In [15]:
# left1 keeps its merge key in an ordinary column named 'key'.
left1 = pd.DataFrame(dict(key=list('abaabc'), value=range(6)))
# right1 carries the matching labels in its index rather than in a column.
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=list('ab'))
left1


Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [16]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [17]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


Since the default merge method is to intersect the join keys, you can instead form the
union of them with an outer join:

In [18]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


Using the indexes of both sides of the merge is also possible:

In [19]:
# Two frames with partially overlapping row labels ('c' and 'e' are shared)
# and disjoint column names; both are built column by column.
left2 = pd.DataFrame(
    {'Ohio': [1., 3., 5.], 'Nevada': [2., 4., 6.]},
    index=list('ace'),
)
right2 = pd.DataFrame(
    {'Missouri': [7., 9., 11., 13], 'Alabama': [8., 10., 12., 14]},
    index=list('bcde'),
)
left2


Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [20]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [21]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


<img src="Pandas_Images\Merge1.JPG">

# Join

DataFrame has a convenient join instance method for merging by index. It can also be used
to combine together many DataFrame objects having the same or similar indexes but
non-overlapping columns. 

In [22]:
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


DataFrame’s join
method performs a left join on the join keys, exactly preserving the left frame’s row
index. It also supports joining the index of the passed DataFrame on one of the columns
of the calling DataFrame:


In [23]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


# Join on indexes (another way of merging)

df_new = df1.join(other=df2, on='col1', how='outer')


df_new = df1.join(other=df2,on=['a','b'], how='outer')


### Note: DataFrame.join() joins on indexes by default.
DataFrame.merge() joins on common columns by
default. 

# Concatenating Along an Axis

<img src="Pandas_Images\concat.JPG">

In [25]:
import numpy as np
# df1: values 0..5 laid out as 3 rows x 2 columns, labelled 'a', 'b', 'c'.
df1 = pd.DataFrame(
    np.arange(6).reshape(-1, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
# df2: values 5..8 laid out as 2 rows x 2 columns; it has no row 'b'.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(-1, 2),
    index=list('ac'),
    columns=['three', 'four'],
)


In [27]:
print(df1)
df2

   one  two
a    0    1
b    2    3
c    4    5


Unnamed: 0,three,four
a,5,6
c,7,8


In [28]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


# Simple concatenation is often the best

### df=pd.concat([df1,df2],axis=0) #top/bottom

### df = df1.append([df2, df3])  #top/bottom (DataFrame.append was removed in pandas 2.0; use pd.concat([df1, df2, df3]) instead)

#### df=pd.concat([df1,df2],axis=1) #left/right


Note: can end up with duplicate rows or cols
Note: concat has an ignore_index parameter

# Reshaping and Pivoting

In [None]:
#stack : This “rotates” or pivots from the columns in the data to the rows
#unstack : This pivots from the rows into the columns

In [29]:
# Name both axes so that stack()/unstack() can refer to levels by name.
state_index = pd.Index(['Ohio', 'Colorado'], name='state')
number_columns = pd.Index(['one', 'two', 'three'], name='number')
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=state_index,
    columns=number_columns,
)

In [30]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


# Stack

In [31]:
 result = data.stack()

In [32]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

# You can rearrange the data back into a DataFrame with unstack:


In [34]:
result.unstack()


number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


# By default the innermost level is unstacked (same with stack). You can unstack a different level by passing a level number or name:


In [35]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [36]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


# Pivoting “Long” to “Wide” Format

In [37]:
import pandas as pd 
  
# Build the example frame record by record: (A = name, B = degree, C = number).
df = pd.DataFrame(
    [
        ('John', 'Masters', 27),
        ('Boby', 'Graduate', 23),
        ('Mina', 'Graduate', 21),
    ],
    columns=['A', 'B', 'C'],
)

In [38]:
df

Unnamed: 0,A,B,C
0,John,Masters,27
1,Boby,Graduate,23
2,Mina,Graduate,21


In [39]:
# DataFrame.pivot only accepts keyword arguments in pandas >= 2.0, so name
# each role explicitly: row labels from 'A', new columns from 'B', cells from 'C'.
df.pivot(index='A', columns='B', values='C')

B,Graduate,Masters
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Boby,23.0,
John,,27.0
Mina,21.0,


In [40]:
df.pivot(index ='A', columns ='B', values =['C', 'A']) 

Unnamed: 0_level_0,C,C,A,A
B,Graduate,Masters,Graduate,Masters
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Boby,23.0,,Boby,
John,,27.0,,John
Mina,21.0,,Mina,


# Pivot_Table Method

# What is a Pivot Table?

   ## A pivot table is a table of statistics that summarizes the data of a more extensive table. 

 The summary of data is reached through various aggregate functions – sum, average, min, max, etc.

### A pivot table is a data processing technique to derive useful information from a table.

In [4]:
import pandas as pd 
# One record per purchase: (fruit, customer, quantity).
purchases = [
    ("apple", "ben", 1),
    ("orange", "alice", 2),
    ("apple", "ben", 3),
    ("avocado", "josh", 1),
    ("orange", "steve", 2),
]
df = pd.DataFrame(purchases, columns=["fruit", "customer", "quantity"])

In [7]:
df

Unnamed: 0,fruit,customer,quantity
0,apple,ben,1
1,orange,alice,2
2,apple,ben,3
3,avocado,josh,1
4,orange,steve,2


In [6]:
# average quantity by fruit
df1 = df.pivot_table(values='quantity', columns='fruit')
df1

fruit,apple,avocado,orange
quantity,2,1,2


# Pivot Table with Aggregate Function

The default aggregate function is the mean. We can specify sum as the aggregate function instead to generate the total quantity for each fruit/customer pair.

In [43]:
# Total quantity per (fruit, customer) pair. The aggregation is passed by
# name: handing pandas the NumPy callable (np.sum) raises a FutureWarning
# in pandas >= 2.1, while the string "sum" gives the same result.
df.pivot_table(index="fruit", columns="customer", values="quantity", aggfunc="sum")

customer,alice,ben,josh,steve
fruit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apple,,4.0,,
avocado,,,1.0,
orange,2.0,,,2.0


In [44]:
# Same totals as a plain sum pivot table, but fruit/customer pairs with no
# purchases show 0 instead of NaN. The aggregation is passed by name because
# the NumPy callable (np.sum) raises a FutureWarning in pandas >= 2.1.
df.pivot_table(index="fruit", columns="customer", values="quantity", aggfunc="sum", fill_value=0)

customer,alice,ben,josh,steve
fruit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apple,0,4,0,0
avocado,0,0,1,0
orange,2,0,0,2


In [None]:
"""How to use the Pandas pivot method
To use the pivot method in Pandas, you need to specify three parameters:

Index: Which column should be used to identify and order your rows vertically
Columns: Which column should be used to create the new columns in our reshaped DataFrame. Each unique value in the column stated here will create a column in our new DataFrame.
Values: Which column(s) should be used to fill the values in the cells of our DataFrame.
"""

# Melt

An inverse operation to pivot for DataFrames is pandas.melt

### Pandas melt() function is used to change the DataFrame format from wide to long. 

### It’s used to create a specific format of the DataFrame object where one or more columns work as identifiers. 

### All the remaining columns are treated as values and unpivoted to the row axis, leaving only two non-identifier columns – variable and value.

In [45]:
# Wide-format example: 'key' identifies the row, 'A'/'B'/'C' hold the values.
df = pd.DataFrame(
    [
        ('foo', 1, 4, 7),
        ('bar', 2, 5, 8),
        ('baz', 3, 6, 9),
    ],
    columns=['key', 'A', 'B', 'C'],
)

In [46]:
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [47]:
# 'key' is kept as the identifier column; every other column is unpivoted
# into (variable, value) rows.
melted = pd.melt(df, id_vars=['key'])

In [48]:
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


# Using pivot, we can reshape back to the original layout:

In [50]:
# DataFrame.pivot only accepts keyword arguments in pandas >= 2.0: 'key'
# becomes the row index, each distinct 'variable' becomes a column, and
# 'value' fills the cells.
reshaped = melted.pivot(index='key', columns='variable', values='value')
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [51]:
#Since the result of pivot creates an index from the column used as the row labels, we
#may want to use reset_index to move the data back into a column:
reshaped.reset_index()


variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [52]:
import pandas as pd

# Wide table: one row per person, one column per attribute.
d1 = {
    "Name": ["Pankaj", "Lisa", "David"],
    "ID": [1, 2, 3],
    "Role": ["CEO", "Editor", "Author"],
}
df = pd.DataFrame(d1)

# Melt to long format: 'ID' stays as the identifier while the 'Name' and
# 'Role' columns are unpivoted into (Attribute, Value) pairs.
df_melted = df.melt(
    id_vars=["ID"],
    value_vars=["Name", "Role"],
    var_name="Attribute",
    value_name="Value",
)
print(df_melted)

# Unmelt with pivot(): 'ID' becomes the index again and each distinct
# 'Attribute' becomes a column (nested under 'Value', since no values= is given).
df_unmelted = df_melted.pivot(index='ID', columns='Attribute')
print(df_unmelted)

   ID Attribute   Value
0   1      Name  Pankaj
1   2      Name    Lisa
2   3      Name   David
3   1      Role     CEO
4   2      Role  Editor
5   3      Role  Author
            Value        
Attribute    Name    Role
ID                       
1          Pankaj     CEO
2            Lisa  Editor
3           David  Author
