# 1. Import & show version

In [1]:
import numpy as np
import pandas as pd
print(pd.__version__)

0.24.2


# 2. Create a series from a list, numpy array and dict?

In [10]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# pd.Series(mylist)
# pd.Series(myarr)
pd.Series(mydict).head()

a    0
b    1
c    2
e    3
d    4
dtype: int64

# 3. Convert the index of a series into a column of a dataframe

In [17]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# 直接用 pd.DataFrame 去做表
pd.DataFrame(ser).reset_index().head()

# to_frame
ser.to_frame().reset_index().head()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


# 4. Combine many series to form a dataframe

In [98]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# pd.concat
pd.concat([ser1, ser2], axis=1)

# pd.DataFrame 直接做表
pd.DataFrame({'col1': ser1, 'col2': ser2}).head()

Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


# 5. Assign name to the series’ index
給 series name 用意是在於當他被合併成 dataframe 的時候，name 就會變成 column name

In [38]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = "aaa"
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: aaa, dtype: object

# 6. Get the items of series A not present in series B
series.isin()

In [63]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

# 7. Get the items not common to both series A and series B

In [74]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# making an intersection series
inter = np.intersect1d(ser1, ser2)

# concating two series
ser_all = pd.concat([ser1, ser2])

# getting the itens not common
ser_all[~ser_all.isin(inter)]

0    1
1    2
2    3
2    6
3    7
4    8
dtype: int64

# 8. Get the minimum, 25th percentile, median, 75th, and max of a numeric series

In [93]:
ser = pd.Series(np.random.normal(10, 5, 25))
stat = ser.describe()

print('min:', stat.loc['min'])
print('25th percentile:', stat.loc['25%'])
print('50th percentile(median):', stat.loc['50%'])
print('75th percentile:', stat.loc['75%'])
print('max:', stat.loc['max'])

min: -3.729368201665988
25th percentile: 9.027979401636898
50th percentile(median): 11.543524061788856
75th percentile: 14.717336790130318
max: 19.335043465676648


In [94]:
# 或者是直接用 np.percentile 抓 0 ~ 100
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([-3.7293682 ,  9.0279794 , 11.54352406, 14.71733679, 19.33504347])

# 9. Get frequency counts of unique items of a series
value_counts()

In [97]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser.value_counts()

b    7
e    6
g    5
d    5
h    3
f    2
c    1
a    1
dtype: int64

# 10. Keep only top 2 most frequent values as it is and replace everything else as ‘Other’

value_counts 算 freq，用 index 抓值

In [132]:
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

ser[~ser.isin(ser.value_counts().index[0:2])] = 'Other'
ser

0         1
1         1
2         4
3         4
4         4
5         4
6         1
7     Other
8     Other
9         1
10    Other
11    Other
dtype: object

# 11. bin a numeric series to 10 groups of equal size

cut & qcut

cut 是指定數字分割；qcut 則是用百分位數分割，兩者皆是建一個新的 series，用給定的 labels 替換掉原值

In [133]:
# Bin the series ser into 10 equal deciles and replace the values with the bin name.
ser = pd.Series(np.random.random(20))

# Desired Output
# First 5 items
# 0    7th
# 1    9th
# 2    7th
# 3    3rd
# 4    8th

In [148]:
bins = np.arange(start=0, stop=1.1, step=0.1)
group_name = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

pd.qcut(ser, q=bins, labels=group_name).head()

0    7th
1    9th
2    1st
3    6th
4    9th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

# 12. Convert a numpy array to a dataframe of given shape
Reshape the series ser into a dataframe with 7 rows and 5 columns

In [171]:
ser = pd.Series(np.random.randint(1, 10, 35))

# My solution: 先轉 np.array，再用 reshape
pd.DataFrame(np.array(ser).reshape(7,5))

# 可以用 .values 將 series 轉 array，pandas 也有 reshape
pd.DataFrame(ser.values.reshape(7,5))

Unnamed: 0,0,1,2,3,4
0,5,3,8,2,4
1,4,7,9,6,2
2,2,2,7,7,4
3,4,2,6,3,7
4,8,6,7,4,9
5,9,4,5,3,8
6,6,1,1,1,2


# 13. Find the positions of numbers that are multiples of 3 from a series

In [173]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    2
1    8
2    2
3    4
4    4
5    6
6    5
dtype: int32

In [187]:
np.argwhere(ser % 3 == 0)

array([[5]], dtype=int64)

# 14. Extract items at given positions from a series

In [190]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
ser[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

# 15. Stack two series vertically and horizontally 

In [197]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

pd.concat([ser1, ser2], axis=0)
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


# 16. Get the positions of items of series A in another series B

In [233]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# 初步想法是直接抓出位置
np.where(ser1.isin(ser2))

# 但題目是更精準地指出 ser2 中 1 -> 對應到 ser1 的哪個位置；3 -> 對應到 ser1 的哪個位置 ... 等
pos = [np.where(ser1 == i) for i in ser2] # 得到結果
np.concatenate(pos).ravel().tolist() # 轉成 list

[5, 4, 0, 8]

# 17. Compute the mean squared error on a truth and predicted series

In [251]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

# MSE
((pred-truth)**2).mean()

0.42517726319354

# 18. Convert the first character of each element in a series to uppercase

In [262]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser.str.capitalize()

# 這一題有很多方法，我用 pandas.series 內建 str.capitalize() 可以直接達成
# 也可以寫 .map(lambda) 用 lambda function 配上一般像是 .title() .upper() 這種對 list 進行操作的函數達成
# 或是直接用 list comprehension 做再轉回 series

0     How
1      To
2    Kick
3    Ass?
dtype: object

# 19. Calculate the number of characters in each word in a series

In [274]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# list comprehension
[len(i) for i in ser]

# lambda function
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

# 20. Compute difference of differences between consequtive numbers of a series

In [275]:
#Input
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Desired Output
# [nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
# [nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]

In [279]:
ser.diff(periods=1)

# diff 的 diff
# ser.diff().diff()

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    6.0
7    8.0
dtype: float64

# 21. Convert a series of date-strings to a timeseries?

In [290]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

# 22. Get the day of month, week number, day of year and day of week from a series of date strings

In [323]:
# Input
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser = pd.to_datetime(ser)
# Desired output

# Date:  [1, 2, 3, 4, 5, 6]
ser.dt.day
      
# Week number:  [53, 5, 9, 14, 19, 23]
ser.dt.weekofyear

# Day num of year:  [1, 33, 63, 94, 125, 157]
ser.dt.dayofyear

# Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']
ser.dt.weekday_name

0       Friday
1    Wednesday
2     Saturday
3     Thursday
4       Monday
5     Saturday
dtype: object

# 23. Convert year-month string to dates corresponding to the 4th day of the month

In [338]:
# Input
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Desired Output

# 0   2010-01-04
# 1   2011-02-04
# 2   2012-03-04
# dtype: datetime64[ns]
ser

0    Jan 2010
1    Feb 2011
2    Mar 2012
dtype: object

In [349]:
# Series.dt.strftime(*args, **kwargs) 可以把 dt object 按照指定格式 format

pd.to_datetime( ser.map(lambda x: pd.to_datetime(x).strftime('%Y-%m-04')) )

# pd.to_datetime(ser)[0].strftime('%Y-%m-04')

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

# 24. Filter words that contain atleast 2 vowels from a series

In [28]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [30]:
def is_more_than_2_vowels(x):
    vowels = list('aeiou')
    return(len([i for i in list(x) if i.lower() in vowels]) > 1)

ser[ser.map(lambda x: is_more_than_2_vowels(x))]

0     Apple
1    Orange
4     Money
dtype: object

# 25. Filter valid emails from a series
Series.str.contains()

In [37]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [51]:
# Series.str.contains(pat, case=True, flags=0, na=None, regex=True)
emails[emails.str.contains(pattern, regex=True)]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

# 26. Get the mean of a series grouped by another series

In [3]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))


In [16]:
# My solution: 先把 series 合併成 df 再計算
df = pd.concat([fruit, weights], axis=1)
df.columns = ['fruit_name', 'weights']
df.groupby('fruit_name')['weights'].mean()

# 但其實不一定要把 series 併成 df 再做 aggregation
weights.groupby(fruit).mean()

apple     6.00
banana    3.75
carrot    7.00
dtype: float64

# 27. Compute the euclidean distance between two series

In [27]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# 直接計算
np.sqrt(sum((p-q)**2))

# numpy function 
np.linalg.norm(p-q)

18.16590212458495

# 28. Find all the local maxima (or peaks) in a numeric series
Get the positions of peaks (values surrounded by smaller values on both sides) in ser.

In [28]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Desired output

# array([1, 5, 7])

In [71]:
# 用 doubld diff 的概念 (詳 numpy practice)
double_diff = np.diff(np.sign(np.diff(ser)))
np.where(double_diff == -2)[0] + 1

array([1, 5, 7], dtype=int64)

# 29. Replace missing spaces in a string with the least frequent character

In [102]:
my_str = 'dbc deb abed gade'

# Desired Output

# 'dbccdebcabedcgade'  # least frequent is 'c'

In [103]:
replacement = pd.value_counts(list(my_str)).dropna().index[-1]

my_str.replace(' ', replacement)

'dbcgdebgabedggade'

# 30. Create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values

In [110]:
# My solution: 先做一個 2000-01-01 ~ 10 個禮拜後的 ndarray，先知道 end_date 再用 np.arange 處理
end_date = np.datetime64('2000-01-01') + 70
pd.Series(np.random.randint(0, 10, 10), index=np.arange('2000-01-01', end_date, 7))

# 或是用 pd.date_range() function 可以直接照需求做 series
# pd.date_range('2000-01-01', periods=10, freq='W-SAT')

2000-01-01    7
2000-01-08    4
2000-01-15    2
2000-01-22    0
2000-01-29    6
2000-02-05    2
2000-02-12    7
2000-02-19    2
2000-02-26    2
2000-03-04    2
dtype: int32

# 31. Fill an intermittent time series so all missing dates show up with values of previous non-missing date

In [112]:
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))

# Desired Output

# 2000-01-01     1.0
# 2000-01-02     1.0
# 2000-01-03    10.0
# 2000-01-04    10.0
# 2000-01-05    10.0
# 2000-01-06     3.0
# 2000-01-07     3.0
# 2000-01-08     NaN

In [135]:
# 用 reindex 是告訴原本的 series 應該要有哪些 index，而消失的值要怎麼補
index = pd.date_range(ser.index[0], ser.index[-1])
ser.reindex(index= index, method='ffill')

# 官方解
ser.resample('D').ffill()

2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64

# 32. Compute the autocorrelations of a numeric series

In [146]:
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))

autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]
print(autocorrelations[1:])
print('Lag having highest correlation: ', np.argmax(np.abs(autocorrelations[1:]))+1)

[0.29, -0.03, -0.05, 0.25, 0.43, 0.11, -0.18, -0.05, 0.77, 0.36]
Lag having highest correlation:  9


# 33. Import only every nth row from a csv file to create a dataframe

In [147]:
# read by chunks
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)
df2 = pd.DataFrame()

In [150]:
# 每個 chunk 再挑第一列
for chunk in df:
    df2 = df2.append(chunk.iloc[0,:])

# 34. Change column values when importing csv to a dataframe

In [159]:
# use converter
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})

# 35. Create a dataframe with rows as strides from a given series

In [189]:
L = pd.Series(range(15))
np.array(L)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [206]:
def stride_series(s, window, stride):
    maximum = len(L) - window
    temp = []
    i = 0
    while i < maximum:
        temp.append(L[i:window+i])
        i += stride
    return(np.array(temp))

stride_series(L, 5, 3)

array([[ 0,  1,  2,  3,  4],
       [ 3,  4,  5,  6,  7],
       [ 6,  7,  8,  9, 10],
       [ 9, 10, 11, 12, 13]], dtype=int64)

# 36. Import only specified columns from a csv file

In [208]:
pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', usecols=['crim', 'medv']).head()

Unnamed: 0,crim,medv
0,0.00632,24.0
1,0.02731,21.6
2,0.02729,34.7
3,0.03237,33.4
4,0.06905,36.2


# 37. Get the nrows, ncolumns, datatype, summary stats of each column of a dataframe? Also get the array and list equivalent

In [209]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [221]:
# nrows & ncols
df.shape

# datatype of each column
df.dtypes

# how many columns under each dtype
df.get_dtype_counts()
df.dtypes.value_counts()

# summary statistics
df.describe()

# numpy array 
df_arr = df.values

# list
df_list = df.values.tolist()

# 38. Extract the row and column number of a particular cell with given criterion

In [227]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Which manufacturer, model and type has the highest Price? 
# What is the row and column number of the cell with the highest Price value?
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i


In [252]:
idx = df['Price'] == df['Price'].max()

df.iloc[np.where(idx)[0][0]].loc[['Manufacturer', 'Model', 'Type']]

Manufacturer    Mercedes-Benz
Model                    300E
Type                  Midsize
Name: 58, dtype: object

In [269]:
df.loc[idx, ['Manufacturer', 'Model', 'Type', 'Price']]

Unnamed: 0,Manufacturer,Model,Type,Price
58,Mercedes-Benz,300E,Midsize,61.9


# 39. Rename a specific columns in a dataframe
Rename the column Type as CarType in df and replace the ‘.’ in column names with ‘_’.

In [273]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
print(df.columns)

Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')


In [286]:
# .rename 後會回傳一整個 df
df = df.rename(columns={'Type': 'CarType'})

# 全部的 . 換成 _
df.columns = [i.replace('.', '_') for i in df.columns]

# 40. Check if a dataframe has any missing values

In [321]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.isnull().values.any()

True

# 41. Count the number of missing values in each column

In [352]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# count each column missing values
count_missing_values = df.apply(lambda x: sum(x.isnull()), axis=0)

# return the colname of the maximum number of missing values
count_missing_values.idxmax()

'Luggage.room'

# 42. Replace missing values of multiple numeric columns with the mean

In [146]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [147]:
for i in df.columns:
    if df[i].dtype == 'float64':
        avg = df[i].mean()
        df[i].fillna(value= avg, inplace=True) 
        #df[i][df[i].isnull()] = avg
        
# 如果是想要對多個欄位一次操作
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian} # 先建字典

df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x, d: x.fillna(d[x.name](x)), args=(d, ))

# 43. Select a specific column from a dataframe as a dataframe instead of a series

In [150]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [170]:
# 這樣是 series
df['a'] 
df.a
df.loc[:,'a']
df.iloc[:, 0]

# 這樣是 dataframe：差別在於有沒有用 [] 包 index
df[['a']]
df.loc[:, ['a']]
df.iloc[:, [0]]

Unnamed: 0,a
0,0
1,5
2,10
3,15


# 44. Change the order of columns of a dataframe

In [171]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))

# 1. In df, interchange columns 'a' and 'c'.
# 2. Create a generic function to interchange two columns, without hardcoding column names.
# 3. Sort the columns in reverse alphabetical order, that is colume 'e' first through column 'a' last.

In [214]:
# Q2 answer
def column_swap(df, col1, col2):
    colnames = df.columns.tolist()
    idx1, idx2 = colnames.index(col1), colnames.index(col2)
    colnames[idx1], colnames[idx2] = colnames[idx2], colnames[idx1]
    return(df[colnames])

column_swap(df, 'a', 'b')

Unnamed: 0,b,a,c,d,e
0,1,0,2,3,4
1,6,5,7,8,9
2,11,10,12,13,14
3,16,15,17,18,19


In [209]:
# Q3 answer
df[sorted(df.columns, reverse=True)]

Unnamed: 0,e,d,c,b,a
0,4,3,2,1,0
1,9,8,7,6,5
2,14,13,12,11,10
3,19,18,17,16,15


# 45. Set the number of rows and columns displayed in the output

In [216]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

df

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i
...,...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,16.6,19.7,...,34.0,,3960.0,,Volkswagen Eurovan
89,Volkswagen,Passat,Compact,17.6,20.0,...,31.5,14.0,2985.0,non-USA,Volkswagen Passat
90,Volkswagen,Corrado,Sporty,22.9,23.3,...,26.0,15.0,2810.0,non-USA,Volkswagen Corrado
91,Volvo,240,Compact,21.8,22.7,...,29.5,14.0,2985.0,non-USA,Volvo 240


# 46. Format or suppress scientific notations in a pandas dataframe

In [217]:
df = pd.DataFrame(np.random.random(4)**10, columns=['random'])
df

Unnamed: 0,random
0,0.0007682111
1,1.495459e-10
2,0.101789
3,0.001775923


In [218]:
# Solution 1: Rounding
df.round(4)

# Solution 2: Use apply to change format
df.apply(lambda x: '%.4f' % x, axis=1)

# Solution 3: Use set_option
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Solution 4: Assign display.float_format
pd.options.display.float_format = '{:.4f}'.format
print(df)

# Reset/undo float formatting
pd.options.display.float_format = None

   random
0  0.0008
1  0.0000
2  0.1018
3  0.0018


# 47. Format all the values in a dataframe as percentages

In [261]:
df = pd.DataFrame(np.random.random(4), columns=['random'])

# pd.style.format 
df.style.format(formatter={
    'random': '{:.2%}'
})

# 用基本 str.format() 加上 list comprehension 把原本的欄位值替換掉
df['random'] = ['{:.2%}'.format(x) for x in df['random']]
df

Unnamed: 0,random
0,7.92%
1,14.65%
2,88.97%
3,92.64%


# 48. Filter every nth row in a dataframe

In [266]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# From df, filter the 'Manufacturer', 'Model' and 'Type' for every 20th row starting from 1st (row 0).
df.loc[::20, ['Manufacturer', 'Model', 'Type']]

# 49. Create a primary key index by combining relevant columns

In [373]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', usecols=[0,1,2,3,5])

# 先把 NaN 補成 missing
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna(value='missing')

# df.index 換成組合欄位
df.index = pd.Index(df['Manufacturer'] + '_' + df['Model'] + '_' + df['Type'])
df

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7
Audi_90_Compact,Audi,90,Compact,25.9,32.3
Audi_100_Midsize,Audi,100,Midsize,,44.6
BMW_535i_Midsize,BMW,535i,Midsize,,
...,...,...,...,...,...
Volkswagen_Eurovan_Van,Volkswagen,Eurovan,Van,16.6,22.7
Volkswagen_Passat_Compact,Volkswagen,Passat,Compact,17.6,22.4
Volkswagen_Corrado_Sporty,Volkswagen,Corrado,Sporty,22.9,23.7
Volvo_240_Compact,Volvo,240,Compact,21.8,23.5


# 50. Get the row number of the nth largest value in a column

In [406]:
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
print(df)

# Find the row position of the 5th largest value of column 'a' in df.
def find_nth_position(df, col, nth):
    return(df[col].argsort()[::-1][nth])

find_nth_position(df, 'a', 5)

    a   b   c
0  26   9  24
1  10  14   7
2  22   2  28
3  23  13  12
4   3  25  26
5   3  24  14
6  16   4  15
7  22   4   4
8   2  14  28
9   2  11  22


6

# 51. Find the position of the nth largest value greater than a given value
!!! none-sense problem

In [428]:
ser = pd.Series(np.random.randint(1, 100, 10))
print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
np.argwhere(ser > ser.mean())[1]

ser:  [84, 94, 58, 70, 87, 19, 34, 39, 14, 84] mean:  58


array([1], dtype=int64)

# 52. Get the last n rows of a dataframe with row sum > 100

In [464]:
np.random.seed(100)
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

# 先抓 row sum > 100 的 index 出來
pos = df.apply(lambda x: sum(x), axis=1) > 100

# 取出 df，再抓倒數兩列
df[pos].iloc[-2:]

Unnamed: 0,0,1,2,3
11,11,39,24,33
14,31,37,34,24


# 53. Find and cap outliers from a series or dataframe column

In [502]:
ser = pd.Series(np.logspace(-2, 2, 30))

# Replace all values of ser in the lower 5%ile and greater than 95%ile with respective 5th and 95th %ile value.

# step 1: get 5th & 95th percentile values 
low, high = ser.quantile(q = [0.05, 0.95])

# step 2: get position
low_pos = ser < low
high_pos = ser > high

# step 3: assign values
ser[low_pos] = '5th'
ser[high_pos] = '95th'


# 54. Reshape a dataframe to the largest possible square after removing the negative values

In [505]:
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,10,47,8,17,-3,-10,-7,5,46,-5
1,-9,18,2,35,37,31,28,33,-3,45
2,22,24,-4,31,34,35,22,41,17,-18
3,42,-4,33,27,28,-11,29,33,14,-17
4,-9,29,9,16,47,6,32,16,11,48
5,41,16,22,-7,28,12,18,20,33,-6
6,18,17,-20,39,21,24,-1,46,-7,45
7,48,34,-20,-10,45,-18,19,16,15,6
8,-1,-16,19,32,24,-7,13,-5,-3,3
9,39,-12,20,27,25,5,11,-10,18,40


In [513]:
# 把 df 拉直 + 移除小於 0 的值
ddd = df.values.ravel()
ddd = ddd[ddd > 0]

# 計算最大正方形邊長
side = int(ddd.shape[0] ** 0.5)

# 把拉直後的 array 由大到小排序，取製作正方形所需的所有值，reshape 成正方形
ddd.argsort()[::-1][0:side**2].reshape(side, -1)

# 55. Swap two rows of a dataframe

In [554]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))

def swap_rows(df, i1, i2):
    a, b = df.iloc[i1, :].copy(), df.iloc[i2, :].copy()
    df.iloc[i1, :], df.iloc[i2, :] = b, a
    return df

swap_rows(df, 1, 2)

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,10,11,12,13,14
2,5,6,7,8,9
3,15,16,17,18,19
4,20,21,22,23,24


# 56. Reverse the rows of a dataframe

In [557]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))

df[::-1]

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


# 57. Create one-hot encodings of a categorical variable (dummy variables)

In [578]:
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))

# get one hot df
one_hot = pd.get_dummies(df['a'])

# drop original column
df.drop('a', axis=1, inplace=True)

# concat df
df = pd.concat([one_hot, df], axis=1)

df

Unnamed: 0,0,5,10,15,20,b,c,d,e
0,1,0,0,0,0,1,2,3,4
1,0,1,0,0,0,6,7,8,9
2,0,0,1,0,0,11,12,13,14
3,0,0,0,1,0,16,17,18,19
4,0,0,0,0,1,21,22,23,24


# 58. Which column contains the highest number of row-wise maximum values

In [607]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))

df.idxmax(axis=1).value_counts().index[0]

2

# 59. Create a new column that contains the row number of nearest column by euclidean distance

In [685]:
np.random.seed(10)
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1), columns=list('pqrs'), index=list('abcdefghij'))
df

Unnamed: 0,p,q,r,s
a,10,16,65,29
b,90,94,30,9
c,74,1,41,37
d,17,12,55,89
e,63,34,73,79
f,50,52,55,78
g,70,14,26,14
h,93,87,31,31
i,90,13,66,32
j,58,37,28,19


In [687]:
nearest_row = pd.Series(index= df.index, name='nearest_row')
dist = pd.Series(index= df.index, name='nearest_row')

for i in df.index:
    euc = df.drop(i).apply(lambda x: np.linalg.norm(x-df.loc[i]), axis=1)
    nearest_row[i] = euc.idxmin()
    dist[i] = euc.min()
    
df['nearest_row'] = nearest_row
df['dist'] = dist

# 60. How to know the maximum possible correlation value of each column against other columns
Compute maximum possible absolute correlation value of each column against other columns in df.

In [2]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), columns=list('pqrstuvwxy'), index=list('abcdefgh'))
df

Unnamed: 0,p,q,r,s,t,u,v,w,x,y
a,50,11,62,84,65,45,34,39,81,14
b,88,23,4,49,44,33,87,2,37,88
c,72,24,92,88,62,81,32,16,35,51
d,9,20,21,53,33,12,83,92,80,76
e,56,95,39,16,15,94,85,37,29,79
f,79,24,17,54,77,39,2,57,26,94
g,55,82,40,25,14,41,24,6,53,3
h,14,1,49,67,39,18,5,25,51,77


In [20]:
abs_corr = np.abs(df.corr())
abs_corr.apply(lambda x: sorted(x)[-2])

p    0.662916
q    0.839533
r    0.559045
s    0.839533
t    0.767757
u    0.583215
v    0.454688
w    0.526669
x    0.662916
y    0.511694
dtype: float64

# 61. Create a column containing the minimum by maximum of each row

In [33]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df.apply(lambda x: np.min(x)/np.max(x), axis=1)

0    0.180723
1    0.088608
2    0.022222
3    0.084211
4    0.052632
5    0.357143
6    0.065934
7    0.061856
dtype: float64

# 62. Create a column that contains the penultimate value(第二大) in each row 

In [47]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df['penultimate'] = df.apply(lambda x: sorted(x.unique())[-2], axis=1) # 注意要 unique
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,penultimate
0,79,37,72,53,44,42,50,83,39,18,79
1,41,48,52,99,77,79,37,53,65,53,79
2,18,42,71,34,90,74,67,77,76,20,77
3,22,64,51,86,19,38,89,50,92,71,89
4,3,84,90,28,64,30,18,25,93,40,90
5,75,36,59,76,77,37,51,2,81,30,77
6,34,44,95,66,32,59,78,23,4,48,78
7,7,80,26,71,66,81,95,42,81,88,88


# 63. Normalize all columns in a dataframe

In [48]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df
# 1. Normalize all columns of df by subtracting the column mean and divide by standard deviation.
# 2. Range all columns of df such that the minimum value in each column is 0 and max is 1.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,68,23,37,1,1,18,79,17,58,47
1,72,73,39,53,77,71,73,9,80,43
2,11,49,7,32,33,18,35,88,70,84
3,38,98,48,52,66,74,77,96,21,37
4,99,73,9,11,77,57,51,12,50,83
5,1,35,47,69,72,75,34,59,20,52
6,51,9,60,8,32,88,9,23,48,92
7,2,5,20,2,62,96,84,8,30,62


In [63]:
# 1
df.apply(lambda x: (x - x.mean()) / x.std(), axis=1)

# 2: min-max normalization
df.apply(lambda x: (x-np.min(x)) / (np.max(x) - np.min(x)), axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.858974,0.282051,0.461538,0.0,0.0,0.217949,1.0,0.205128,0.730769,0.589744
1,0.887324,0.901408,0.422535,0.619718,0.957746,0.873239,0.901408,0.0,1.0,0.478873
2,0.049383,0.518519,0.0,0.308642,0.320988,0.135802,0.345679,1.0,0.777778,0.950617
3,0.220779,1.0,0.350649,0.402597,0.584416,0.688312,0.727273,0.974026,0.0,0.207792
4,1.0,0.711111,0.0,0.022222,0.755556,0.533333,0.466667,0.033333,0.455556,0.822222
5,0.0,0.459459,0.621622,0.918919,0.959459,1.0,0.445946,0.783784,0.256757,0.689189
6,0.511905,0.011905,0.619048,0.0,0.285714,0.952381,0.011905,0.178571,0.47619,1.0
7,0.0,0.031915,0.191489,0.0,0.638298,1.0,0.87234,0.06383,0.297872,0.638298


# 64. Replace both the diagonals of dataframe with 0

In [77]:
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))

# 讓兩個對角線都變成 0
for i in range(df.shape[0]):
    df.iloc[i, i] = 0
    df.iloc[df.shape[0]-i-1, i] = 0
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,91,23,45,35,88,73,51,17,0
1,17,0,45,5,9,27,92,15,0,32
2,85,90,0,32,32,6,21,0,46,34
3,46,16,35,0,88,84,0,3,9,4
4,6,37,61,9,0,0,34,75,75,73
5,61,94,32,87,0,0,83,86,63,32
6,44,11,91,0,41,63,0,26,59,74
7,30,78,0,34,56,42,77,0,74,23
8,33,0,53,78,34,70,7,77,0,44
9,0,36,43,30,47,26,10,60,5,0


# 65. Get the particular group of a groupby dataframe by key

In [81]:
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})

df_grouped = df.groupby(['col1'])
df_grouped.get_group('apple')

Unnamed: 0,col1,col2,col3
0,apple,0.238017,9
3,apple,0.067686,9
6,apple,0.231352,5


# 66. Get the n’th largest value of a column when grouped by another column

In [100]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'taste': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

df

Unnamed: 0,fruit,taste,price
0,apple,0.32606,13
1,banana,0.869558,0
2,orange,0.374795,1
3,apple,0.046276,9
4,banana,0.704222,13
5,orange,0.35598,10
6,apple,0.043046,12
7,banana,0.777352,9
8,orange,0.498637,8


In [103]:
df.groupby('fruit').get_group('banana')['taste'].sort_values().iloc[-2]

0.7773515150938201

# 67. Compute grouped mean on pandas dataframe and keep the grouped column as another column (not index)

## groupby(as_index=False) 就可以避免 group 變成 index

In [113]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

df['mean price'] = df.groupby('fruit')['price'].transform('mean')
df

Unnamed: 0,fruit,rating,price,mean price
0,apple,0.50359,5,8.0
1,banana,0.744813,8,5.0
2,orange,0.598029,1,5.666667
3,apple,0.157625,13,8.0
4,banana,0.64299,7,5.0
5,orange,0.967671,3,5.666667
6,apple,0.759037,6,8.0
7,banana,0.669784,0,5.0
8,orange,0.323577,13,5.666667


In [114]:
df.groupby('fruit', as_index=False)['price'].mean()

Unnamed: 0,fruit,price
0,apple,8.0
1,banana,5.0
2,orange,5.666667


# 68. Join two dataframes by 2 columns so they have only the common rows

In [169]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [126]:
# 1: 暴力 rename
pd.merge(df1.rename(columns={'fruit': 'fruit-pazham', 'weight': 'weight-kilo'}),
         df2.rename(columns={'pazham': 'fruit-pazham', 'kilo': 'weight-kilo'}), how='inner')

Unnamed: 0,fruit-pazham,weight-kilo,price
0,apple,high,0
1,orange,low,13


In [124]:
# 2: 這個才是題目要回答的作法，inner join key 左右表名稱不同，price 都要留下併標註是左表還右表的 price
pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on=['pazham', 'kilo'], suffixes=['_left', '_right'])

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,0,apple,high,0
1,apple,high,2,apple,high,0
2,apple,high,12,apple,high,0
3,orange,low,0,orange,low,13
4,orange,low,0,orange,low,13
5,orange,low,13,orange,low,13


# 69. Remove rows from a dataframe that are present in another dataframe

In [170]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

# From df1, remove the rows that are present in df2. All three columns must be the same.

In [175]:
com1 = df1['fruit']+df1['weight']+df1['price'].to_string()
com2 = df2['pazham']+df2['kilo']+df2['price'].to_string()
df1[~com1.isin(com2)]

# 直接用 df1.isin(df2) 也可以達到 row by row 的比對
df1[~df1.isin(df2).all(axis=1)]

# 70. Get the positions where values of two columns match

In [199]:
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})

np.where(df['fruit1'] == df['fruit2'])

(array([0, 1, 6, 8], dtype=int64),)

# 71. Create lags and leads of a column in a dataframe?

pd.Series.shift

In [205]:
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))

# lag 1
df['a_lag1'] = df['a'].shift(1)

# lead 1
df['b_lead1'] = df['b'].shift(-1)

df

Unnamed: 0,a,b,c,d,a_lag1,b_lead1
0,39,69,43,19,,22.0
1,45,22,22,27,39.0,13.0
2,31,13,93,44,45.0,97.0
3,8,97,90,48,31.0,41.0
4,87,41,48,16,8.0,


# 72. Get the frequency of unique values in the entire dataframe

In [218]:
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd'))

In [219]:
# 對每個 column 做 value_counts()
df.apply(lambda x: x.value_counts(), axis=0)

Unnamed: 0,a,b,c,d
1,2.0,1.0,,
2,1.0,1.0,,2.0
3,1.0,,2.0,1.0
4,1.0,,,
6,,1.0,1.0,2.0
8,,1.0,,
9,,1.0,2.0,


In [220]:
# 對整張表作 value_counts()
pd.value_counts(df.values.ravel())

6    4
3    4
2    4
9    3
1    3
8    1
4    1
dtype: int64

# 73. Split a text column into two separate columns?

In [265]:
df = pd.DataFrame(["STD, City    State",
"33, Kolkata    West Bengal",
"44, Chennai    Tamil Nadu",
"40, Hyderabad    Telengana",
"80, Bangalore    Karnataka"], columns=['row'])

df

Unnamed: 0,row
0,"STD, City State"
1,"33, Kolkata West Bengal"
2,"44, Chennai Tamil Nadu"
3,"40, Hyderabad Telengana"
4,"80, Bangalore Karnataka"


In [267]:
# Solution
df_out = df.row.str.split(',', expand=True)

# Make first row as header
new_header = df_out.iloc[0]
df_out = df_out[1:]
df_out.columns = new_header
df_out

Unnamed: 0,STD,City State
1,33,Kolkata West Bengal
2,44,Chennai Tamil Nadu
3,40,Hyderabad Telengana
4,80,Bangalore Karnataka
