# Concatenating, Merging and Joining DataFrames

## Making DataFrames

### Loading Dataset

In [2]:
# importing pandas
import pandas as pd

# csv file location
url = 'https://raw.githubusercontent.com/tariqzahratahdi/DataScience/refs/heads/main/datasets/fortune500.csv'

# read the csv file and drop every row that contains a NaN;
# dropna() returns a new frame, so the load step is a single chained
# expression and nothing is mutated in place
data = pd.read_csv(url).dropna()

# show dataframe
data

Unnamed: 0,company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country
0,Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1,USA
1,State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2,China
2,Sinopec Group,3,267518,-9.1,1257.9,310726,-65.0,Wang Yupu,Petroleum Refining,Energy,4,China
3,China National Petroleum,4,262573,-12.3,1867.5,585619,-73.7,Zhang Jianhua,Petroleum Refining,Energy,3,China
4,Toyota Motor,5,254694,7.7,16899.3,437575,-12.3,Akio Toyoda,Motor Vehicles and Parts,Motor Vehicles & Parts,8,Japan
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Teva Pharmaceutical Industries,496,21903,11.5,329.0,92890,-79.3,Yitzhak Peterburg,Pharmaceuticals,Health Care,0,Israel
496,New China Life Insurance,497,21796,-13.3,743.9,100609,-45.6,Wan Feng,Insurance: Life Health (stock),Financials,427,China
497,Wm. Morrison Supermarkets,498,21741,-11.3,406.4,11630,20.4,David T. Potts,Food and Drug Stores,Food & Drug Stores,437,Britain
498,TUI,499,21655,-5.5,1151.7,16247,195.5,Friedrich Joussen,Travel Services,Business Services,467,Germany


Make DataFrame: revenues

In [25]:
# keep all rows, but only the company name, its revenues and its country
data_revenues = data.loc[:, ['company', 'revenues', 'country']]

data_revenues

Unnamed: 0,company,revenues,country
0,Walmart,485873,USA
1,State Grid,315199,China
2,Sinopec Group,267518,China
3,China National Petroleum,262573,China
4,Toyota Motor,254694,Japan
...,...,...,...
495,Teva Pharmaceutical Industries,21903,Israel
496,New China Life Insurance,21796,China
497,Wm. Morrison Supermarkets,21741,Britain
498,TUI,21655,Germany


Make DataFrame: profits

In [28]:
# keep all rows, but only the company name, its profits and its country
data_profits = data.loc[:, ['company', 'profits', 'country']]

data_profits

Unnamed: 0,company,profits,country
0,Walmart,13643.0,USA
1,State Grid,9571.3,China
2,Sinopec Group,1257.9,China
3,China National Petroleum,1867.5,China
4,Toyota Motor,16899.3,Japan
...,...,...,...
495,Teva Pharmaceutical Industries,329.0,Israel
496,New China Life Insurance,743.9,China
497,Wm. Morrison Supermarkets,406.4,Britain
498,TUI,1151.7,Germany


Make DataFrame: revenues of top USA companies:

In [29]:
# select top USA companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_revenues` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_usa_revenues = data.loc[data['country'] == 'USA', ['company', 'revenues']].head(3)

data_usa_revenues

Unnamed: 0,company,revenues
0,Walmart,485873
8,Apple,215639
9,Exxon Mobil,205004


Make DataFrame: profits of top USA companies:

In [30]:
# select top USA companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_profits` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_usa_profits = data.loc[data['country'] == 'USA', ['company', 'profits']].head(3)

data_usa_profits

Unnamed: 0,company,profits
0,Walmart,13643.0
8,Apple,45687.0
9,Exxon Mobil,7840.0


Make DataFrame: revenues of top China companies:

In [31]:
# select top China companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_revenues` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_china_revenues = data.loc[data['country'] == 'China', ['company', 'revenues']].head(3)

data_china_revenues

Unnamed: 0,company,revenues
1,State Grid,315199
2,Sinopec Group,267518
3,China National Petroleum,262573


Make DataFrame: profits of top China companies:

In [32]:
# select top China companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_profits` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_china_profits = data.loc[data['country'] == 'China', ['company', 'profits']].head(3)

data_china_profits

Unnamed: 0,company,profits
1,State Grid,9571.3
2,Sinopec Group,1257.9
3,China National Petroleum,1867.5


Make DataFrame: revenues of top Japan companies:

In [33]:
# select top Japan companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_revenues` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_japan_revenues = data.loc[data['country'] == 'Japan', ['company', 'revenues']].head(3)

data_japan_revenues

Unnamed: 0,company,revenues
4,Toyota Motor,254694
28,Honda Motor,129198
32,Japan Post Holdings,122990


Make DataFrame: profits of top Japan companies:

In [34]:
# select top Japan companies
# Filter rows and pick columns in a single .loc call on `data`, so the boolean
# mask and the frame it filters share the same index, and the cell does not
# depend on `data_profits` (a name that is reassigned later in the notebook).
# head(3) yields the "top" companies only because the rows of `data` appear
# to be ordered by rank, as the displayed table suggests.
data_japan_profits = data.loc[data['country'] == 'Japan', ['company', 'profits']].head(3)

data_japan_profits

Unnamed: 0,company,profits
4,Toyota Motor,16899.3
28,Honda Motor,5690.3
32,Japan Post Holdings,-267.4


## Concatenating DataFrames

Concatenating DataFrames means combining them either vertically (row-wise) or horizontally (column-wise).

We use the `pd.concat()` function to concatenate DataFrames.

*Syntax:*

`pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None)`

*Parameters:*

* `objs`: A list of Series or DataFrame objects to concatenate.
* `axis`: Determines the concatenation direction (0 for rows, 1 for columns).
* `join`: Specifies how to handle columns when `axis=0` (or rows when `axis=1`) if they don't align (`'outer'` for union, `'inner'` for intersection).
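* `ignore_index`: If `True`, the original labels along the concatenation axis are discarded and the result is labeled `0, 1, ..., n-1`.
* `keys`: Optional sequence of labels, one per object in `objs`, used to build a hierarchical index that identifies which input each piece came from.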

### Concatenate DataFrames Vertically

**Example:** concatenate DataFrames vertically (row-wise: `axis=0` default):

In [35]:
# stack the three per-country revenue tables on top of each other (row-wise)
# and renumber the rows from 0
# NOTE: this rebinds `data_revenues`, which until now held the
# company/revenues/country selection of the full dataset
data_revenues = pd.concat(
    objs=[data_usa_revenues, data_china_revenues, data_japan_revenues],
    axis=0,
    ignore_index=True,
)

data_revenues

Unnamed: 0,company,revenues
0,Walmart,485873
1,Apple,215639
2,Exxon Mobil,205004
3,State Grid,315199
4,Sinopec Group,267518
5,China National Petroleum,262573
6,Toyota Motor,254694
7,Honda Motor,129198
8,Japan Post Holdings,122990


**Example:** concatenate the profits DataFrames vertically:

In [36]:
# stack the three per-country profit tables on top of each other (row-wise)
# and renumber the rows from 0
# NOTE: this rebinds `data_profits`, which until now held the
# company/profits/country selection of the full dataset
data_profits = pd.concat(
    objs=[data_usa_profits, data_china_profits, data_japan_profits],
    axis=0,
    ignore_index=True,
)

data_profits

Unnamed: 0,company,profits
0,Walmart,13643.0
1,Apple,45687.0
2,Exxon Mobil,7840.0
3,State Grid,9571.3
4,Sinopec Group,1257.9
5,China National Petroleum,1867.5
6,Toyota Motor,16899.3
7,Honda Motor,5690.3
8,Japan Post Holdings,-267.4


### Concatenate DataFrames Horizontally

**Example:** concatenate DataFrames horizontally (column-wise: `axis=1`):

In [37]:
# place the revenues and profits tables side by side, aligned on the row index;
# both inputs carry a `company` column, so it appears twice in the result
data_revenues_profits = pd.concat(
    objs=[data_revenues, data_profits],
    axis='columns',
)

data_revenues_profits

Unnamed: 0,company,revenues,company.1,profits
0,Walmart,485873,Walmart,13643.0
1,Apple,215639,Apple,45687.0
2,Exxon Mobil,205004,Exxon Mobil,7840.0
3,State Grid,315199,State Grid,9571.3
4,Sinopec Group,267518,Sinopec Group,1257.9
5,China National Petroleum,262573,China National Petroleum,1867.5
6,Toyota Motor,254694,Toyota Motor,16899.3
7,Honda Motor,129198,Honda Motor,5690.3
8,Japan Post Holdings,122990,Japan Post Holdings,-267.4


## Merging DataFrames

Merging two DataFrames means combining them based on common columns or index levels.

It's used when you want to bring together related information from different DataFrames.

The following figure illustrates the merging operation:

course-pandas-merge.svg

We use the `merge()` function to merge DataFrames.

*Syntax:*

`pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False)`


*Parameters:*

* `left`, `right`: The DataFrames to merge.
* `how`: Specifies the type of join ('inner', 'left', 'right', 'outer').
* `on`: Column(s) to join on (if present in both DataFrames).
* `left_on`, `right_on`: Columns to join on in the left and right DataFrames respectively (if names differ).
* `left_index`, `right_index`: Use the index of the left or right DataFrame as the join key.
* Types of Merges (`how` parameter):
  * Inner Join: Returns only rows where the join key exists in both DataFrames.
  * Left Join: Returns all rows from the left DataFrame and matching rows from the right.
  * Right Join: Returns all rows from the right DataFrame and matching rows from the left.
  * Outer Join: Returns all rows from both DataFrames, filling in NaN for non-matching values.

### Merging DataFrames Using One Key

#### Making DataFrames

Make left DataFrame:

In [38]:
# left table for the merge examples: revenues of the USA and Japan companies,
# stacked row-wise with a fresh 0..n-1 index
data_usjp_revenues = pd.concat(
    objs=[data_usa_revenues, data_japan_revenues],
    axis=0,
    ignore_index=True,
)

data_usjp_revenues

Unnamed: 0,company,revenues
0,Walmart,485873
1,Apple,215639
2,Exxon Mobil,205004
3,Toyota Motor,254694
4,Honda Motor,129198
5,Japan Post Holdings,122990


Make right DataFrame:

In [39]:
# right table for the merge examples: profits of the China and Japan companies,
# stacked row-wise with a fresh 0..n-1 index
data_cnjp_profits = pd.concat(
    objs=[data_china_profits, data_japan_profits],
    axis=0,
    ignore_index=True,
)

data_cnjp_profits

Unnamed: 0,company,profits
0,State Grid,9571.3
1,Sinopec Group,1257.9
2,China National Petroleum,1867.5
3,Toyota Motor,16899.3
4,Honda Motor,5690.3
5,Japan Post Holdings,-267.4


#### Inner Join

**Example:** merge DataFrames with inner join:

In [40]:
# inner join on `company`: only companies present in both tables are kept
data_merge_inner = data_usjp_revenues.merge(
    data_cnjp_profits,
    how='inner',
    on='company',
)

data_merge_inner

Unnamed: 0,company,revenues,profits
0,Toyota Motor,254694,16899.3
1,Honda Motor,129198,5690.3
2,Japan Post Holdings,122990,-267.4


#### Left Join

**Example:** merge DataFrames with left join:

In [41]:
# left join on `company`: every row of the revenues table is kept;
# profits is NaN where the profits table has no matching company
data_merge_left = data_usjp_revenues.merge(
    data_cnjp_profits,
    how='left',
    on='company',
)

data_merge_left

Unnamed: 0,company,revenues,profits
0,Walmart,485873,
1,Apple,215639,
2,Exxon Mobil,205004,
3,Toyota Motor,254694,16899.3
4,Honda Motor,129198,5690.3
5,Japan Post Holdings,122990,-267.4


#### Right Join

**Example:** merge DataFrames with right join:

In [42]:
# right join on `company`: every row of the profits table is kept;
# revenues is NaN where the revenues table has no matching company
data_merge_right = data_usjp_revenues.merge(
    data_cnjp_profits,
    how='right',
    on='company',
)

data_merge_right

Unnamed: 0,company,revenues,profits
0,State Grid,,9571.3
1,Sinopec Group,,1257.9
2,China National Petroleum,,1867.5
3,Toyota Motor,254694.0,16899.3
4,Honda Motor,129198.0,5690.3
5,Japan Post Holdings,122990.0,-267.4


#### Outer Join

**Example:** merge DataFrames with outer join:

In [43]:
# merge dataframes with outer join: every company from either table is kept,
# with NaN in the column whose table has no matching row.
# The result gets its own name so the right-join result in
# `data_merge_right` is not overwritten.
data_merge_outer = pd.merge(data_usjp_revenues, data_cnjp_profits, on='company', how='outer')

data_merge_outer

Unnamed: 0,company,revenues,profits
0,Apple,215639.0,
1,China National Petroleum,,1867.5
2,Exxon Mobil,205004.0,
3,Honda Motor,129198.0,5690.3
4,Japan Post Holdings,122990.0,-267.4
5,Sinopec Group,,1257.9
6,State Grid,,9571.3
7,Toyota Motor,254694.0,16899.3
8,Walmart,485873.0,


## Joining DataFrames

The `join()` method is used to combine columns of two DataFrames based on their indexes.

It's a simple way of merging two DataFrames when the relationship between them is primarily based on their row indexes.

### Set indices

Set index of the left DataFrame:

In [44]:
# use the company name as the row index of the left table;
# set_index returns a new frame, `data_usjp_revenues` itself is unchanged
data_usjp_revenues_indexed = data_usjp_revenues.set_index(keys='company')

data_usjp_revenues_indexed

Unnamed: 0_level_0,revenues
company,Unnamed: 1_level_1
Walmart,485873
Apple,215639
Exxon Mobil,205004
Toyota Motor,254694
Honda Motor,129198
Japan Post Holdings,122990


Set index of the right DataFrame:

In [45]:
# use the company name as the row index of the right table;
# set_index returns a new frame, `data_cnjp_profits` itself is unchanged
data_cnjp_profits_indexed = data_cnjp_profits.set_index(keys='company')

data_cnjp_profits_indexed

Unnamed: 0_level_0,profits
company,Unnamed: 1_level_1
State Grid,9571.3
Sinopec Group,1257.9
China National Petroleum,1867.5
Toyota Motor,16899.3
Honda Motor,5690.3
Japan Post Holdings,-267.4


### Inner Join

In [46]:
# index-based inner join: keep only companies found in both indexes
data_join_inner = data_usjp_revenues_indexed.join(
    other=data_cnjp_profits_indexed,
    how='inner',
)

data_join_inner

Unnamed: 0_level_0,revenues,profits
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Toyota Motor,254694,16899.3
Honda Motor,129198,5690.3
Japan Post Holdings,122990,-267.4


### Left Join

In [47]:
# index-based left join: keep every company of the revenues table;
# profits is NaN where the profits table has no such company
data_join_left = data_usjp_revenues_indexed.join(
    other=data_cnjp_profits_indexed,
    how='left',
)

data_join_left

Unnamed: 0_level_0,revenues,profits
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Walmart,485873,
Apple,215639,
Exxon Mobil,205004,
Toyota Motor,254694,16899.3
Honda Motor,129198,5690.3
Japan Post Holdings,122990,-267.4


### Right Join

In [48]:
# index-based right join: keep every company of the profits table;
# revenues is NaN where the revenues table has no such company
data_join_right = data_usjp_revenues_indexed.join(
    other=data_cnjp_profits_indexed,
    how='right',
)

data_join_right

Unnamed: 0_level_0,revenues,profits
company,Unnamed: 1_level_1,Unnamed: 2_level_1
State Grid,,9571.3
Sinopec Group,,1257.9
China National Petroleum,,1867.5
Toyota Motor,254694.0,16899.3
Honda Motor,129198.0,5690.3
Japan Post Holdings,122990.0,-267.4


### Outer Join

In [49]:
# index-based outer join: keep every company from either table,
# with NaN in the column whose table has no such company
data_join_outer = data_usjp_revenues_indexed.join(
    other=data_cnjp_profits_indexed,
    how='outer',
)

data_join_outer

Unnamed: 0_level_0,revenues,profits
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,215639.0,
China National Petroleum,,1867.5
Exxon Mobil,205004.0,
Honda Motor,129198.0,5690.3
Japan Post Holdings,122990.0,-267.4
Sinopec Group,,1257.9
State Grid,,9571.3
Toyota Motor,254694.0,16899.3
Walmart,485873.0,
