7.1 Handling Missing Data

In [2]:
import pandas as pd
import numpy as np

In [3]:
float_data=pd.Series([1.2,-3.5,np.nan,0])

In [4]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data=pd.Series([
    "aardvark",np.nan,None,"avocado"
])

In [7]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [8]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [9]:
float_data=pd.Series([1,2,None],dtype="float64")

In [10]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [11]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## Table 7-1. NA handling object methods

| Method   | Description                                                                 | Example (Python) | Output |
|----------|-----------------------------------------------------------------------------|------------------|--------|
| `dropna` | Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate | `pd.Series([1, np.nan, 3]).dropna()` | `[1.0, 3.0]` |
| `fillna` | Fill in missing data with some value or using an interpolation method such as `"ffill"` or `"bfill"` | `pd.Series([1, np.nan, 3]).fillna(0)` | `[1.0, 0.0, 3.0]` |
| `isna`   | Return Boolean values indicating which values are missing/NA                | `pd.Series([1, np.nan, 3]).isna()` | `[False, True, False]` |
| `notna`  | Negation of `isna`, returns True for non-NA values and False for NA values  | `pd.Series([1, np.nan, 3]).notna()` | `[True, False, True]` |

Filtering Out Missing Data

In [12]:
data=pd.Series(
    [
        1,np.nan,3.5,np.nan,7
    ]
)

In [13]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [14]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],                   
[np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing how="all" will drop only rows that are all NA:

In [18]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
data[4]=np.nan

In [20]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


To drop columns in the same way, pass axis="columns":

In [21]:
data.dropna(axis="columns",how="all") # use rows instead of columns 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
df=pd.DataFrame(np.random.standard_normal((7,3)))

In [23]:
df

Unnamed: 0,0,1,2
0,1.217565,0.60359,0.838494
1,-0.11955,-2.34293,1.054814
2,-0.66055,1.040334,1.097837
3,0.221481,-0.127133,-0.575027
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


In [24]:
df.iloc[:4:1]=np.nan

In [25]:
df.iloc[:2,2]=np.nan

In [26]:
df

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,,,
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


In [27]:
df.dropna()

Unnamed: 0,0,1,2
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


thresh=2 â†’ Sets a threshold: a row must have at least 2 non-NaN values to be kept.

If a row has fewer than 2 non-null entries, it will be dropped.

In [28]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


# Filling In Missing Data

In [29]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


In [30]:
df.fillna(
    {1:0.5,2:0}
)

Unnamed: 0,0,1,2
0,,0.5,0.0
1,,0.5,0.0
2,,0.5,0.0
3,,0.5,0.0
4,0.356558,-0.187517,-0.260789
5,-0.076458,0.451745,-1.740658
6,0.655369,-0.796971,-1.271566


In [31]:
df=pd.DataFrame(np.random.standard_normal((6,3)))

In [32]:
df

Unnamed: 0,0,1,2
0,0.796679,-0.345025,-1.825701
1,1.009427,1.04131,-0.128338
2,-0.511743,-1.926985,1.260908
3,0.304825,0.732156,0.866788
4,-0.478738,-1.527161,0.695577
5,-0.423835,0.671577,1.197358


In [33]:
df.iloc[2:,1]=np.nan

In [34]:
df.iloc[4:,2]=np.nan

In [35]:
df

Unnamed: 0,0,1,2
0,0.796679,-0.345025,-1.825701
1,1.009427,1.04131,-0.128338
2,-0.511743,,1.260908
3,0.304825,,0.866788
4,-0.478738,,
5,-0.423835,,


In [36]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,0.796679,-0.345025,-1.825701
1,1.009427,1.04131,-0.128338
2,-0.511743,1.04131,1.260908
3,0.304825,1.04131,0.866788
4,-0.478738,1.04131,0.866788
5,-0.423835,1.04131,0.866788


In [37]:
df.fillna(method="ffill",limit=2)

  df.fillna(method="ffill",limit=2)


Unnamed: 0,0,1,2
0,0.796679,-0.345025,-1.825701
1,1.009427,1.04131,-0.128338
2,-0.511743,1.04131,1.260908
3,0.304825,1.04131,0.866788
4,-0.478738,,0.866788
5,-0.423835,,0.866788


In [38]:
data=pd.Series(
    [1.,np.nan,3.5,np.nan,7]
)

In [39]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [40]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Table 7-2. `fillna` function arguments

| Argument | Description                                                                 | Example (Python) | Output |
|----------|-----------------------------------------------------------------------------|------------------|--------|
| `value`  | Scalar value or dictionary-like object to use to fill missing values        | `pd.Series([1, np.nan, 3]).fillna(value=0)` | `[1.0, 0.0, 3.0]` |
| `method` | Interpolation method: `"bfill"` (backward fill) or `"ffill"` (forward fill); default is `None` | `pd.Series([1, np.nan, np.nan, 4]).fillna(method="ffill")` | `[1.0, 1.0, 1.0, 4.0]` |
| `axis`   | Axis to fill on (`"index"` or `"columns"`); default is `"index"`            | `df.fillna(method="ffill", axis="columns")` | Missing values filled column-wise |
| `limit`  | For forward/backward filling, maximum number of consecutive periods to fill | `pd.Series([1, np.nan, np.nan, 4]).fillna(method="ffill", limit=1)` | `[1.0, 1.0, NaN, 4.0]` |

# 7.2 Data Transformation

Removing Duplicates

In [41]:
data=pd.DataFrame({
    "k1":["one","two"]*3+["two"],
    "k2":[1,1,2,3,3,4,4]
})

In [42]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [43]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [44]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [45]:
data["v1"]=range(7)

In [46]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [47]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [48]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [49]:
data.drop_duplicates(["k1", "k2"],keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


Transforming Data Using a Function or Mapping

In [50]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                             "pastrami", "corned beef", "bacon",
                             "pastrami", "honey ham", "nova lox"],
                      "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [51]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [52]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [53]:
data["animal"]=data["food"].map(meat_to_animal)

In [54]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [55]:
def get_animal(x):
    return meat_to_animal[x]

In [56]:
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values

In [57]:
data=pd.Series([1.,-999.,2.,-999.,-1000.,3.])

In [58]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [59]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [60]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [61]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

In [62]:
data=pd.DataFrame(
    np.arange(12).reshape((3,4)),
    index=["Ohio","Colorado","New York"],
    columns=["one","two","three","four"]
)

In [63]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [64]:
def transform(x):
    return x[:4].upper()

In [65]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [66]:
data.index=data.index.map(transform)

In [67]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [68]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [69]:
data.rename(
    index={"OHIO":"INDIANA"},
    columns={"three":"peekaboo"}
)

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning

In [70]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [71]:
bins=[18, 25, 35, 60, 100]

In [72]:
age_catgeories=pd.cut(ages,bins)

In [73]:
age_catgeories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [74]:
age_catgeories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [75]:
age_catgeories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [76]:
age_catgeories.categories[0]

Interval(18, 25, closed='right')

In [77]:
pd.value_counts(age_catgeories)

  pd.value_counts(age_catgeories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [78]:
pd.cut(ages,bins,right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [79]:
group_names=["Youth","YoungAdult","MiddleAged","Senior"]

In [80]:
pd.cut(ages,bins,labels=group_names)
data1=pd.cut(ages,bins,labels=group_names)

In [81]:
data1.value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
Name: count, dtype: int64

In [82]:
data=np.random.uniform(size=20)

In [83]:
data

array([0.59060699, 0.03711806, 0.7268219 , 0.7636184 , 0.00617176,
       0.63677817, 0.34879068, 0.38358527, 0.99431134, 0.37182939,
       0.71399813, 0.71232119, 0.41225645, 0.34002366, 0.91946729,
       0.26833609, 0.34088206, 0.10610435, 0.54186798, 0.59056927])

In [84]:
pd.cut(data,4,precision=2)

[(0.5, 0.75], (0.0052, 0.25], (0.5, 0.75], (0.75, 0.99], (0.0052, 0.25], ..., (0.25, 0.5], (0.25, 0.5], (0.0052, 0.25], (0.5, 0.75], (0.5, 0.75]]
Length: 20
Categories (4, interval[float64, right]): [(0.0052, 0.25] < (0.25, 0.5] < (0.5, 0.75] < (0.75, 0.99]]

In [85]:
data=np.random.standard_normal(1000)

In [86]:
data

array([ 1.78141404e-01,  1.85543336e+00, -1.14350600e+00, -1.27511330e-01,
       -5.54524200e-01, -1.22902663e+00,  1.96655751e+00, -1.31020263e+00,
       -6.41485080e-01, -7.40828034e-01,  1.51422332e+00,  1.09098447e+00,
       -2.05898321e+00, -4.68417603e-01,  1.36263838e+00, -1.60958389e+00,
        2.61836186e-01, -6.54108369e-01, -6.90982184e-01,  9.24116770e-02,
        1.74021802e-01,  3.25242219e-01, -3.05780219e-01,  6.94597362e-02,
       -5.25297345e-01,  7.15905732e-01,  5.72601552e-01, -7.83935970e-01,
        2.04533028e+00, -2.92977982e-02,  1.02918157e+00,  1.27054171e+00,
       -1.04470743e+00, -1.12214900e+00,  1.86711414e-01,  3.80852966e-01,
        4.15757418e-01, -1.52035478e+00,  9.05115520e-02, -1.09397510e+00,
       -2.42386789e-01,  1.77534886e+00, -1.44388822e+00, -2.35628718e+00,
        2.83891694e-01, -4.66763949e-01, -6.54890850e-01, -1.18631798e+00,
        1.15119304e+00, -7.51916228e-02,  1.58312699e+00, -3.86968589e-01,
       -5.41138441e-01,  

In [87]:
quartiles=pd.qcut(data,4,precision=2)

In [88]:
quartiles

[(-0.0021, 0.63], (0.63, 2.86], (-2.8899999999999997, -0.68], (-0.68, -0.0021], (-0.68, -0.0021], ..., (0.63, 2.86], (-0.0021, 0.63], (0.63, 2.86], (-0.0021, 0.63], (-0.68, -0.0021]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.8899999999999997, -0.68] < (-0.68, -0.0021] < (-0.0021, 0.63] < (0.63, 2.86]]

In [89]:
pd.value_counts(quartiles)

  pd.value_counts(quartiles)


(-2.8899999999999997, -0.68]    250
(-0.68, -0.0021]                250
(-0.0021, 0.63]                 250
(0.63, 2.86]                    250
Name: count, dtype: int64

In [90]:
pd.qcut(data,[0,0.1,0.5,0.9,1]).value_counts()

(-2.885, -1.279]      100
(-1.279, -0.00209]    400
(-0.00209, 1.265]     400
(1.265, 2.862]        100
Name: count, dtype: int64


# Detecting and Filtering Outliers

In [91]:
data=pd.DataFrame(np.random.standard_normal((1000,4)))

In [92]:
data

Unnamed: 0,0,1,2,3
0,1.306375,0.884125,-1.718732,-0.493724
1,-2.037998,-0.047604,-0.951057,-0.096360
2,0.829827,-0.515405,-0.280152,-1.794366
3,-0.812637,0.080343,0.274540,-1.978524
4,-0.899629,-0.895350,-1.691873,0.821900
...,...,...,...,...
995,-0.542000,-0.280235,1.044118,1.600900
996,-0.614647,-0.992824,-0.204217,-0.574071
997,-0.303970,3.440370,-0.622323,0.176235
998,-2.466074,-0.206688,-0.043951,-0.108638


In [93]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.005181,-0.009208,-0.022848,-0.001619
std,0.990935,1.027777,0.96623,1.011875
min,-3.535306,-3.79029,-2.940027,-3.316629
25%,-0.719862,-0.716032,-0.683629,-0.695518
50%,-0.019453,-0.002004,-0.036381,0.005877
75%,0.654721,0.684557,0.574691,0.680384
max,3.784889,3.44037,2.965389,2.873746


In [94]:
col=data[2]

In [95]:
col

0     -1.718732
1     -0.951057
2     -0.280152
3      0.274540
4     -1.691873
         ...   
995    1.044118
996   -0.204217
997   -0.622323
998   -0.043951
999   -1.487115
Name: 2, Length: 1000, dtype: float64

In [96]:
col[col.abs()>3]

Series([], Name: 2, dtype: float64)

In [97]:
data[(data.abs()>3).any(axis="columns")]

Unnamed: 0,0,1,2,3
11,3.756849,-0.474253,1.042059,1.24651
16,3.784889,2.699173,-1.497224,0.434116
227,-0.361221,-0.276542,-0.240584,-3.077106
237,3.199117,0.665594,-0.8557,-0.590922
336,1.062829,0.020302,-0.956543,-3.265822
466,-0.25436,1.349296,-0.485371,-3.101344
590,1.281145,-3.79029,-1.982314,-0.89986
829,-3.535306,1.264811,1.879343,0.518648
974,1.655772,1.201009,-0.528882,-3.316629
997,-0.30397,3.44037,-0.622323,0.176235


In [98]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,3.0,3.0,-3.0,-3.0
1,-3.0,-3.0,-3.0,-3.0
2,3.0,-3.0,-3.0,-3.0
3,-3.0,3.0,3.0,-3.0
4,-3.0,-3.0,-3.0,3.0
...,...,...,...,...
995,-3.0,-3.0,3.0,3.0
996,-3.0,-3.0,-3.0,-3.0
997,-3.0,3.0,-3.0,3.0
998,-3.0,-3.0,-3.0,-3.0


In [99]:
data[data.abs()>3]=np.sign(data)*3

In [100]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.006386,-0.008858,-0.022848,-0.000858
std,0.983235,1.023778,0.96623,1.009522
min,-3.0,-3.0,-2.940027,-3.0
25%,-0.719862,-0.716032,-0.683629,-0.695518
50%,-0.019453,-0.002004,-0.036381,0.005877
75%,0.654721,0.684557,0.574691,0.680384
max,3.0,3.0,2.965389,2.873746


In [101]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,-1.0,-1.0,-1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,-1.0,-1.0,1.0


# Permutation and Random Sampling

In [102]:
df=pd.DataFrame(np.arange(5*7).reshape((5,7)))

In [None]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [104]:
sampler=np.random.permutation(5)

In [105]:
sampler

array([4, 1, 2, 3, 0], dtype=int32)

In [106]:
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6


In [107]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6


In [108]:
columnn_sampler=np.random.permutation(7)

In [109]:
columnn_sampler

array([5, 2, 6, 0, 1, 4, 3], dtype=int32)

In [110]:
df.take(columnn_sampler,axis="columns")

Unnamed: 0,5,2,6,0,1,4,3
0,5,2,6,0,1,4,3
1,12,9,13,7,8,11,10
2,19,16,20,14,15,18,17
3,26,23,27,21,22,25,24
4,33,30,34,28,29,32,31


In [112]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [113]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [114]:
choices=pd.Series([5,7,-1,6,4])

In [116]:
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [119]:
choices.sample(n=10,replace=True)

1    7
4    4
3    6
2   -1
4    4
4    4
2   -1
4    4
4    4
1    7
dtype: int64

Computing Indicator/Dummy Variables

In [120]:
df=pd.DataFrame({
    "key":list("bbacab"),
    "data1":range(6)
})

In [121]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [122]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [146]:
dummies=pd.get_dummies(df["key"],prefix="key")

In [147]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [126]:
df_with_dummy=df[["data1"]].join(dummies)

In [127]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


In [128]:
mnames = ["movie_id", "title", "genres"]

In [133]:
movies=pd.read_table("datasets/movielens/movies.dat",sep="::",header=None,names=mnames,engine="python")

In [136]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [137]:
dummies=movies["genres"].str.get_dummies("|")

In [141]:
dummies.iloc[:10,:6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [142]:
movies_windic=movies.join(dummies.add_prefix("Genre_"))

In [151]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western       

In [152]:
np.random.seed(12345)

In [160]:
values=np.random.uniform(size=10)

In [161]:
values

array([0.74771481, 0.96130674, 0.0083883 , 0.10644438, 0.29870371,
       0.65641118, 0.80981255, 0.87217591, 0.9646476 , 0.72368535])

In [162]:
bins=[0,0.2,0.4,0.6,0.8,1]

In [163]:
pd.cut(values,bins)

[(0.6, 0.8], (0.8, 1.0], (0.0, 0.2], (0.0, 0.2], (0.2, 0.4], (0.6, 0.8], (0.8, 1.0], (0.8, 1.0], (0.8, 1.0], (0.6, 0.8]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [164]:
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,True,False
1,False,False,False,False,True
2,True,False,False,False,False
3,True,False,False,False,False
4,False,True,False,False,False
5,False,False,False,True,False
6,False,False,False,False,True
7,False,False,False,False,True
8,False,False,False,False,True
9,False,False,False,True,False


7.3 Extension Data Types

In [165]:
s=pd.Series([1,2,3,None])

In [166]:
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [167]:
s.dtype

dtype('float64')

In [168]:
s=pd.Series([1,2,3,None],dtype=pd.Int64Dtype())

In [169]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [170]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [171]:
s[3]

<NA>

In [173]:
s[3] is pd.NA

True

In [174]:
s=pd.Series([1,2,3,None],dtype="Int64")

In [175]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [176]:
s=pd.Series(["one","two",None,"three"],dtype=pd.StringDtype())

In [177]:
s

0      one
1      two
2     <NA>
3    three
dtype: string

In [178]:
dict={
    "A":[1,2,None,4],
    "B":["one","two","three",None],
    "C":[False,None,False,True]
}

In [179]:
df=pd.DataFrame(dict)

In [180]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [181]:
df["A"]=df["A"].astype("Int64")

In [183]:
df["B"]=df["B"].astype("string")

In [185]:
df["C"]=df["C"].astype("boolean")

In [186]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


## Table 7-3. pandas extension data types

| Extension type   | Description                                                                 | Example (Python) |
|------------------|-----------------------------------------------------------------------------|------------------|
| `BooleanDtype`   | Nullable Boolean data, use `"boolean"` when passing as string               | `pd.Series([True, False, None], dtype="boolean")` |
| `CategoricalDtype` | Categorical data type, use `"category"` when passing as string            | `pd.Series(["a","b","a"], dtype="category")` |
| `DatetimeTZDtype` | Datetime with time zone                                                    | `pd.Series(pd.date_range("2024-01-01", periods=3, tz="UTC"))` |
| `Float32Dtype`   | 32-bit nullable floating point, use `"Float32"` when passing as string      | `pd.Series([1.0, None, 3.5], dtype="Float32")` |
| `Float64Dtype`   | 64-bit nullable floating point, use `"Float64"` when passing as string      | `pd.Series([1.0, None, 3.5], dtype="Float64")` |
| `Int8Dtype`      | 8-bit nullable signed integer, use `"Int8"` when passing as string          | `pd.Series([1, None, 3], dtype="Int8")` |
| `Int16Dtype`     | 16-bit nullable signed integer, use `"Int16"` when passing as string        | `pd.Series([1, None, 3], dtype="Int16")` |
| `Int32Dtype`     | 32-bit nullable signed integer, use `"Int32"` when passing as string        | `pd.Series([1, None, 3], dtype="Int32")` |
| `Int64Dtype`     | 64-bit nullable signed integer, use `"Int64"` when passing as string        | `pd.Series([1, None, 3], dtype="Int64")` |
| `UInt8Dtype`     | 8-bit nullable unsigned integer, use `"UInt8"` when passing as string       | `pd.Series([1, None, 3], dtype="UInt8")` |
| `UInt16Dtype`    | 16-bit nullable unsigned integer, use `"UInt16"` when passing as string     | `pd.Series([1, None, 3], dtype="UInt16")` |
| `UInt32Dtype`    | 32-bit nullable unsigned integer, use `"UInt32"` when passing as string     | `pd.Series([1, None, 3], dtype="UInt32")` |
| `UInt64Dtype`    | 64-bit nullable unsigned integer, use `"UInt64"` when passing as string     | `pd.Series([1, None, 3], dtype="UInt64")` |

# 7.4 String Manipulation