In [3]:
#Missing data occurs commonly in many data analysis applications. One of the goals of pandas is to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data by default.
import pandas as pd
import numpy as np

# Example Series of floats containing one missing value (NaN).
float_data = pd.Series([1.2, -3.5, np.nan, 0])

float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
# Boolean mask that is True exactly where the value is missing.
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Series with two missing entries.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

# dropna() returns only the non-missing values, keeping their original labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# 4x3 frame with one complete row, two partially missing rows and one row
# that is missing entirely.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [7]:
# With no arguments, only the row that has no missing value at all (row 0) is kept.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
# Passing how="all" drops only the rows in which every value is NA.
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
#Keep in mind that these functions return new objects by default and do not modify the contents of the original object.

In [11]:
# Add a column labelled 4 that is entirely NaN (this modifies `data` itself).
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [12]:
# To drop columns in the same way, pass axis="columns" (done in the next cell).
# NOTE(review): the assignment below repeats the previous cell verbatim; it
# leaves `data` unchanged and could be removed.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [13]:
# Drops only the columns whose values are all NA - here the column labelled 4.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
# 7x3 frame of standard-normal draws. The generator is seeded so that
# Restart & Run All produces the same frame every time; the seed value itself
# is arbitrary. Outputs recorded from an earlier unseeded run will differ
# until the cells below are re-executed.
df = pd.DataFrame(np.random.default_rng(12345).standard_normal((7, 3)))

In [15]:
# Set the first four rows of the column at position 1 to NaN (in place).
df.iloc[:4, 1] = np.nan

In [16]:
# Set the first two rows of the column at position 2 to NaN (in place),
# then display the resulting frame.
df.iloc[:2, 2] = np.nan
df


Unnamed: 0,0,1,2
0,0.837828,,
1,0.680403,,
2,1.518855,,-2.328722
3,0.690496,,1.134099
4,-1.50717,0.562075,0.042582
5,0.544544,0.550185,2.0284
6,1.61183,1.191153,0.405608


In [17]:
 df.dropna()

Unnamed: 0,0,1,2
4,-1.50717,0.562075,0.042582
5,0.544544,0.550185,2.0284
6,1.61183,1.191153,0.405608


In [18]:
# thresh=2 keeps the rows that have at least two non-NA values; rows 0 and 1
# have only one and are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.518855,,-2.328722
3,0.690496,,1.134099
4,-1.50717,0.562075,0.042582
5,0.544544,0.550185,2.0284
6,1.61183,1.191153,0.405608


In [19]:
 df.fillna(0)

Unnamed: 0,0,1,2
0,0.837828,0.0,0.0
1,0.680403,0.0,0.0
2,1.518855,0.0,-2.328722
3,0.690496,0.0,1.134099
4,-1.50717,0.562075,0.042582
5,0.544544,0.550185,2.0284
6,1.61183,1.191153,0.405608


In [20]:
# A dict gives a separate fill value per column label: 0.5 for column 1
# and 0 for column 2.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.837828,0.5,0.0
1,0.680403,0.5,0.0
2,1.518855,0.5,-2.328722
3,0.690496,0.5,1.134099
4,-1.50717,0.562075,0.042582
5,0.544544,0.550185,2.0284
6,1.61183,1.191153,0.405608


In [22]:
# Removing duplicates: rows 5 and 6 of this frame are identical.
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [23]:
# True for each row that repeats an earlier row; only row 6 does here.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [24]:
# Returns the frame without the repeated row (row 6); the first occurrence stays.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [28]:
# Add value columns so the following cells can deduplicate on a subset of
# columns. v1 and v2 appear in the recorded output and v1 is used by a later
# cell, but no cell visible in this notebook creates them; define them here
# (with the 0..6 values shown in the output) so the notebook runs from a
# fresh kernel.
data["v1"] = range(7)
data["v2"] = range(7)
data["vsdasd"] = range(7)
data

Unnamed: 0,k1,k2,v1,v2,vsdasd
0,one,1,0,0,0
1,two,1,1,1,1
2,one,2,2,2,2
3,two,3,3,3,3
4,one,3,4,4,4
5,two,4,5,5,5
6,two,4,6,6,6


In [30]:
# Deduplicate on k1 only: k1 contains just "one" and "two", so only the first
# row for each of those two values is kept.
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1,v2,vsdasd
0,one,1,0,0,0
1,two,1,1,1,1


In [32]:
# Nothing is removed here because every value in v1 is different.
data.drop_duplicates(subset=["v1"])

Unnamed: 0,k1,k2,v1,v2,vsdasd
0,one,1,0,0,0
1,two,1,1,1,1
2,one,2,2,2,2
3,two,3,3,3,3
4,one,3,4,4,4
5,two,4,5,5,5
6,two,4,6,6,6


In [33]:
# Meat products and their weights, used to demonstrate Series.map.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [34]:
# Lookup table: food name -> animal the meat comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [36]:
# Series.map with a dict looks up each food and yields the mapped animal;
# the result is stored as a new "animal" column on `data`.
data["animal"] = data["food"].map(meat_to_animal)

data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [37]:
def get_animal(x):
    """Return the animal that the food named ``x`` comes from.

    The name is looked up in the module-level ``meat_to_animal`` dict; a food
    that is not a key of that dict raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

# map also accepts a function, called once per element; the values match the
# dict-based mapping, and the resulting Series keeps the name "food".
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [42]:
# Renaming axis indexes: a 3x4 frame with labelled rows and columns.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [46]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    head = x[:4]
    return head.upper()
# Index.map returns a new Index with transform applied to each label.
# Note that "New York" becomes "NEW " (the space is its fourth character).
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [47]:
# Assigning to data.index replaces the row labels on `data` itself.
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [48]:
# rename returns a relabelled frame and leaves `data` untouched: str.title is
# applied to every index label and str.upper to every column name.
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [50]:
# With dicts, only the listed labels are renamed (index "OHIO" and column
# "three"); all other labels are left as they are.
data.rename(index={"OHIO": "Değişti"},columns={"three": "peekaboo"})


Unnamed: 0,one,two,peekaboo,four
Değişti,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [51]:
# Ages to be grouped into discrete buckets.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bin edges passed to pd.cut below; consecutive edges delimit one bucket.
bins = [18, 25, 35, 60, 100]


In [52]:
# pd.cut assigns each age to the interval between two consecutive bin edges.
# As the output shows, intervals are closed on the right: 25 falls in (18, 25].
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [53]:
# One label per bin, in bin order; labels= replaces the interval notation
# in the resulting categories.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

pd.cut(ages,bins,labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']