# __Transformation of Data__

In [11]:
import numpy as np
import pandas as pd

In [12]:
df = pd.read_csv(r'../Datasets/Tips100.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,18.69,2.31,Male,No,Sat,Dinner,3,6.23,Brandon Bradley,4427601595688633,Sat4056
1,17.29,2.71,Male,No,Thur,Lunch,2,8.64,Brian Diaz,4759290988169738,Thur9501
2,11.87,1.63,Female,No,Thur,Lunch,2,5.94,Annette Cunningham,675937746864,Thur4780
3,10.65,1.50,Female,No,Thur,Lunch,2,5.32,Linda Zhang,3560509622598239,Thur9593
4,9.78,1.73,Male,No,Thur,Lunch,2,4.89,David Stewart,3578014604116399,Thur7276
...,...,...,...,...,...,...,...,...,...,...,...
95,26.41,1.50,Female,No,Sat,Dinner,2,13.20,Melody Simon,4745394421258160,Sat8980
96,17.26,2.74,Male,No,Sun,Dinner,3,5.75,Gregory Smith,4292362333741,Sun5205
97,13.16,2.75,Female,No,Thur,Lunch,2,6.58,Lindsey Meyer,676239597203,Thur6245
98,21.16,3.00,Male,No,Thur,Lunch,2,10.58,Keith Lewis,4356005144080422,Thur6273


__assign()__

Purpose: Add or modify columns in a DataFrame.

Works with: DataFrames.

Returns: A new DataFrame with the added or modified columns. It doesn’t change the original DataFrame unless reassigned.

In [13]:
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Add a new column C based on column A
df_new = df1.assign(C=df1['A'] * 2)

print(df_new)

   A  B  C
0  1  4  2
1  2  5  4
2  3  6  6


__Task:__ 

How to create a new column called `tip_perc` showing tips as a percentage of the total bill, and another column `tip_per_person`.

You can use the `assign` method to create multiple columns in a dataframe in one shot. 

It accepts a lambda function, which takes the _entire dataframe_ as an argument.

In [14]:
df = df.assign(tip_perc = lambda x: round(x['tip']/x['total_bill'], 4),
              tip_per_person = lambda x: round(x['tip']/x['size'], 4))
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_perc,tip_per_person
0,18.69,2.31,Male,No,Sat,Dinner,3,6.23,Brandon Bradley,4427601595688633,Sat4056,0.1236,0.77
1,17.29,2.71,Male,No,Thur,Lunch,2,8.64,Brian Diaz,4759290988169738,Thur9501,0.1567,1.355
2,11.87,1.63,Female,No,Thur,Lunch,2,5.94,Annette Cunningham,675937746864,Thur4780,0.1373,0.815
3,10.65,1.5,Female,No,Thur,Lunch,2,5.32,Linda Zhang,3560509622598239,Thur9593,0.1408,0.75
4,9.78,1.73,Male,No,Thur,Lunch,2,4.89,David Stewart,3578014604116399,Thur7276,0.1769,0.865


In [15]:
df['time']

0     Dinner
1      Lunch
2      Lunch
3      Lunch
4      Lunch
       ...  
95    Dinner
96    Dinner
97     Lunch
98     Lunch
99    Dinner
Name: time, Length: 100, dtype: object

In [16]:
# df.eval('tip_per_person= tip/size',inplace=True)
# df.eval('tip_perc= tip/total_bill',inplace=True)
# df.head() Same as above

# __map()__ 

Purpose: Apply a function element-wise to a Series.

Works with: Series only.

Returns: A new Series after applying the function.

In [33]:

s = pd.Series([1, 2, 3, 4])

s_mapped = s.map(lambda x: x * 2)

print(s_mapped)


0    2
1    4
2    6
3    8
dtype: int64


In [34]:
s = pd.Series(['cat', 'dog', 'fish'])
s_mapped = s.map({'cat': 'kitten', 'dog': 'puppy'})
print(s_mapped)


0    kitten
1     puppy
2       NaN
dtype: object


__Task:__ 

Encode the `day` variable to show the number of the week day, rather than an abbreviation.

The map function is used to transform every item in a column based on a `lambda function logic` or a `dict` lookup.

Note: `DataFrame` does not have a `map` method. Only a `Series` does.

__Example 1:__ dict based map.

Number code the days of the week based on the dict `weekdays`.

In [17]:
weekdays = {'Mon':1, 'Tue': 2, 'Wed': 3, 'Thur': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}

In [18]:
df['day'] = df['day'].map(weekdays)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_perc,tip_per_person
0,18.69,2.31,Male,No,6,Dinner,3,6.23,Brandon Bradley,4427601595688633,Sat4056,0.1236,0.77
1,17.29,2.71,Male,No,4,Lunch,2,8.64,Brian Diaz,4759290988169738,Thur9501,0.1567,1.355
2,11.87,1.63,Female,No,4,Lunch,2,5.94,Annette Cunningham,675937746864,Thur4780,0.1373,0.815
3,10.65,1.5,Female,No,4,Lunch,2,5.32,Linda Zhang,3560509622598239,Thur9593,0.1408,0.75
4,9.78,1.73,Male,No,4,Lunch,2,4.89,David Stewart,3578014604116399,Thur7276,0.1769,0.865


In [19]:
time = {'Lunch': 1, 'Dinner': 2}
df['time'] = df['time'].map(time)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_perc,tip_per_person
0,18.69,2.31,Male,No,6,2,3,6.23,Brandon Bradley,4427601595688633,Sat4056,0.1236,0.77
1,17.29,2.71,Male,No,4,1,2,8.64,Brian Diaz,4759290988169738,Thur9501,0.1567,1.355
2,11.87,1.63,Female,No,4,1,2,5.94,Annette Cunningham,675937746864,Thur4780,0.1373,0.815
3,10.65,1.5,Female,No,4,1,2,5.32,Linda Zhang,3560509622598239,Thur9593,0.1408,0.75
4,9.78,1.73,Male,No,4,1,2,4.89,David Stewart,3578014604116399,Thur7276,0.1769,0.865


__Example 2:__ Lambda function based map

__Task__: Mask the CC number and show only the last 4 digits.

In [20]:
df['CC Number'].map(lambda x: "XXXX"+str(x)[-4:])

0     XXXX8633
1     XXXX9738
2     XXXX6864
3     XXXX8239
4     XXXX6399
        ...   
95    XXXX8160
96    XXXX3741
97    XXXX7203
98    XXXX0422
99    XXXX7571
Name: CC Number, Length: 100, dtype: object

If length != 16, show 'xxxx'

In [21]:
df['CC Number'] = df['CC Number'].map(lambda x: "x"+str(x)[-4:] if len(str(x))==16 else 'xxxx')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_perc,tip_per_person
0,18.69,2.31,Male,No,6,2,3,6.23,Brandon Bradley,x8633,Sat4056,0.1236,0.77
1,17.29,2.71,Male,No,4,1,2,8.64,Brian Diaz,x9738,Thur9501,0.1567,1.355
2,11.87,1.63,Female,No,4,1,2,5.94,Annette Cunningham,xxxx,Thur4780,0.1373,0.815
3,10.65,1.5,Female,No,4,1,2,5.32,Linda Zhang,x8239,Thur9593,0.1408,0.75
4,9.78,1.73,Male,No,4,1,2,4.89,David Stewart,x6399,Thur7276,0.1769,0.865


__applymap()__

`applymap()` is the `DataFrame` version of a `map`. It applies a logic for transformation to every element of a DataFrame.

__Task:__ For all alphabetic cells, replace value with the number of characters in the cell.

Purpose: Apply a function element-wise to all elements of a DataFrame.

Works with: DataFrames only.

Returns: A new DataFrame with the function applied to each element.

In [37]:
df = df.apply(lambda col: col.map(lambda x: str(x).upper()[0] if str(x).isalpha() else x))
df



Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,130,57,2,178,66,64,11,29,87,68
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,80,50,29,141,48,29,58,53,68,26
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,64,46,4,157,33,33,24,18,62,44
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,63,54,14,107,66,50,25,21,43,31
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,61,49,11,124,63,56,7,47,85,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,01/12/2014,95,12,41,6,39,0,16,15,14,...,56,7,13,122,4,23,4,11,110,10
518,08/12/2014,43,39,31,10,34,3,2,30,25,...,34,20,18,70,36,5,23,22,63,9
519,15/12/2014,35,7,15,0,0,0,7,7,4,...,30,36,4,72,5,21,14,0,17,10
520,22/12/2014,30,23,8,0,11,4,1,9,10,...,27,17,21,12,5,17,1,1,83,2


The `Payer Name` is untouched because of the space character. So, remove space and try again.

__Challange__

In [24]:
## to remove all numbers from a string an show only character part in Payment ID column

In [25]:
def remove_digits(s):
    return ''.join([i for i in s if not i.isdigit()])

In [26]:
df['Payment_Day'] = df['Payment ID'].map(remove_digits)
df['Payment_Day'][:5]

0     Sat
1    Thur
2    Thur
3    Thur
4    Thur
Name: Payment_Day, dtype: object

__Apply a function Row wise or Column Wise__

Applies a function along an axis (rows or columns).

Works on both Series and DataFrames:

For Series, it applies the function to each element.

For DataFrames, it applies the function to rows or columns, depending on the axis.

Flexible input: Can take any Python function, lambda functions, or NumPy functions.

__Syntax__

### For a Series
Series.apply(func)

### For a DataFrame
DataFrame.apply(func, axis=0)  # axis=0 applies the function to columns (default)

DataFrame.apply(func, axis=1)  # axis=1 applies the function to rows


In [27]:
import pandas as pd
import numpy as np

In [28]:
df = pd.read_csv(r'../Datasets/hungary_chickenpox.csv')
df

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,130,57,2,178,66,64,11,29,87,68
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,80,50,29,141,48,29,58,53,68,26
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,64,46,4,157,33,33,24,18,62,44
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,63,54,14,107,66,50,25,21,43,31
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,61,49,11,124,63,56,7,47,85,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,01/12/2014,95,12,41,6,39,0,16,15,14,...,56,7,13,122,4,23,4,11,110,10
518,08/12/2014,43,39,31,10,34,3,2,30,25,...,34,20,18,70,36,5,23,22,63,9
519,15/12/2014,35,7,15,0,0,0,7,7,4,...,30,36,4,72,5,21,14,0,17,10
520,22/12/2014,30,23,8,0,11,4,1,9,10,...,27,17,21,12,5,17,1,1,83,2


__Example 1:__ 

Create a new column that contains the the highest number of Chickenpox cases for any region.

In [29]:
Max_cases = df.drop(columns='Date').apply(np.max,axis='columns')
Max_cases

0      178
1      200
2      191
3      163
4      172
      ... 
517    122
518     70
519     72
520     83
521    259
Length: 522, dtype: int64

__With For Loop__

In [30]:
# With For Loop
max_values = []
for i, row in enumerate(df.iterrows()):
        max_values.append(np.max(row[1][1:]))

In [31]:
# Check
max_values[:5]    

[178, 200, 191, 163, 172]

__Example 2:__ 

What is the median number of Chickepox cases for every region (column)

In [32]:
df.drop(columns='Date').apply(np.median,axis='columns')

0      73.5
1      55.5
2      48.5
3      49.5
4      62.0
       ... 
517    13.5
518    24.0
519     7.0
520    10.5
521    43.5
Length: 522, dtype: float64