# Removing Data

In [13]:
import pandas as pd
import numpy as np

---

## Dropping rows

In [5]:
# 3x4 demo frame holding the integers 0..11, one letter per column.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [6]:
# Returns a copy without the rows labelled 0 and 2; `df` itself is untouched.
df.drop(index=[0, 2])

Unnamed: 0,A,B,C,D
1,4,5,6,7


## Dropping columns

In [7]:
# Returns a copy without columns 'B' and 'C'; `df` itself is untouched.
df.drop(columns=['B', 'C'])

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


## Drop duplicates

In [8]:
# Seven rows where the last one repeats the (k1, k2) pair of the row before it.
df = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [9]:
# Boolean Series: True for every row that repeats an earlier row in full.
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [10]:
df.drop_duplicates()  # drops the row at index 6, which repeats row 5

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [11]:
# Add a column of distinct values so no two rows match on every column any more.
df['v1'] = range(7)
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [12]:
# Compare rows on k1 and k2 only; the first occurrence of each pair is kept.
df.drop_duplicates(subset=['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [13]:
# Same comparison on k1 and k2, but keep the last occurrence of each pair.
df.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


---

# Detecting Outliers

![](https://quera.org/qbox/view/VK5rBXYm1C/outlier.png)

In [14]:
# 30 random integers; randint's upper bound is exclusive, so values lie in [-10, 9].
# NOTE(review): no random seed is set, so a re-run will not reproduce the values
# shown in the saved output.
seri = pd.Series(np.random.randint(-10,10, size = 30))
# Plant two obvious outliers far outside the sampled range.
seri[10] = -100
seri[27] = 50
np.array(seri)

array([   3,    7,   -9,   -4,    4,   -7,   -7,   -9,    4,   -3, -100,
          4,   -3,    8,   -3,   -1,    8,    9,    4,    8,   -3,   -4,
          2,    8,    6,    6,    8,   50,   -7,    1])

In [15]:
# First and third quartiles (the 0.25 and 0.75 quantiles) of the series.
Q1 = seri.quantile(0.25)
Q3 = seri.quantile(0.75)
print(f"first quartile = {Q1}")
print(f"third quartile = {Q3}")

first quartile = -3.75
third quartile = 6.75


![](https://quera.org/qbox/view/TOF1Cyvzz8/iqr_quartiles.png)

In [16]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Filter with a boolean mask and rebind the name instead of dropping in place;
# the surviving values keep their original index labels.
is_outlier = (seri < lower_fence) | (seri > upper_fence)
seri = seri[~is_outlier]
np.array(seri)

array([ 3,  7, -9, -4,  4, -7, -7, -9,  4, -3,  4, -3,  8, -3, -1,  8,  9,
        4,  8, -3, -4,  2,  8,  6,  6,  8, -7,  1])

---

# Missing and Null Values

In [17]:
from numpy import nan as NA

# Four labelled rows: A is complete, B and D are partly missing, C is all NaN.
data = pd.DataFrame(
    [
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.],
    ],
    index=['A', 'B', 'C', 'D'],
)
data

Unnamed: 0,0,1,2
A,1.0,6.5,3.0
B,1.0,,
C,,,
D,,6.5,3.0


In [18]:
# Boolean frame of the same shape: True wherever a value is missing.
data.isna()

Unnamed: 0,0,1,2
A,False,False,False
B,False,True,True
C,True,True,True
D,True,False,False


In [19]:
# With no arguments, every row containing at least one missing value is dropped.
data.dropna()

Unnamed: 0,0,1,2
A,1.0,6.5,3.0


---

In [20]:
# Seven labelled rows with varying numbers of missing values per row.
data = pd.DataFrame(
    [
        [1., 6.5, 3., 8., 2.3],
        [1., NA, NA, 5, 1.4],
        [NA, NA, 2, NA, 4.5],
        [NA, 6.5, 3., NA, 3.1],
        [2.5, 8.1, NA, NA, 2],
        [NA, NA, NA, 4.2, NA],
        [2, 5.2, 1.8, NA, 2.5],
    ],
    index=['A', 'B', 'C', 'D', 'E', 'F', 'G'],
)
data

Unnamed: 0,0,1,2,3,4
A,1.0,6.5,3.0,8.0,2.3
B,1.0,,,5.0,1.4
C,,,2.0,,4.5
D,,6.5,3.0,,3.1
E,2.5,8.1,,,2.0
F,,,,4.2,
G,2.0,5.2,1.8,,2.5


### Drop rows with too many null values

In [21]:
# thresh=3 keeps only the rows that have at least 3 non-missing values.
data.dropna(thresh=3)

Unnamed: 0,0,1,2,3,4
A,1.0,6.5,3.0,8.0,2.3
B,1.0,,,5.0,1.4
D,,6.5,3.0,,3.1
E,2.5,8.1,,,2.0
G,2.0,5.2,1.8,,2.5


---

In [22]:
# Ten labelled rows; column 3 is missing in all but one of them.
data = pd.DataFrame(
    [
        [1., 6.5, 3., NA, 2.3],
        [1., NA, NA, 5, 1.4],
        [NA, NA, 2, NA, 4.5],
        [NA, 6.5, 3., NA, 3.1],
        [2.5, 8.1, NA, NA, 2],
        [NA, NA, 1, NA, NA],
        [2, 5.2, 1.8, NA, 2.5],
        [3, 2.1, 8.1, NA, 2.5],
        [NA, 8.2, NA, NA, 1.9],
        [2, NA, 1.5, NA, 7.2],
    ],
    index=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
)
data

Unnamed: 0,0,1,2,3,4
A,1.0,6.5,3.0,,2.3
B,1.0,,,5.0,1.4
C,,,2.0,,4.5
D,,6.5,3.0,,3.1
E,2.5,8.1,,,2.0
F,,,1.0,,
G,2.0,5.2,1.8,,2.5
H,3.0,2.1,8.1,,2.5
I,,8.2,,,1.9
J,2.0,,1.5,,7.2


In [23]:
# Number of missing values in each column.
data.isna().sum()

0    4
1    4
2    3
3    9
4    1
dtype: int64

### Drop columns with at least 90% null values

In [24]:
# The mean of the boolean mask is the fraction of missing values per column;
# drop every column whose fraction is 0.9 or higher.
data.drop(columns=data.columns[data.isna().mean() >= 0.9])

Unnamed: 0,0,1,2,4
A,1.0,6.5,3.0,2.3
B,1.0,,,1.4
C,,,2.0,4.5
D,,6.5,3.0,3.1
E,2.5,8.1,,2.0
F,,,1.0,
G,2.0,5.2,1.8,2.5
H,3.0,2.1,8.1,2.5
I,,8.2,,1.9
J,2.0,,1.5,7.2


---

# Correlation

*Correlation is a statistical measure that expresses the extent to which two variables are linearly related (meaning they change together at a constant rate). It’s a common tool for describing simple relationships without making a statement about cause and effect.*

![](https://www.jmp.com/en_ca/statistics-knowledge-portal/what-is-correlation/_jcr_content/par/styledcontainer_2069/par/image_5ae8.img.png/1556043162339.png)

![](https://quera.org/qbox/view/9K7OMlOOoZ/Pearson_Correlation_Coefficient_and_associated_scatterplots.png)

---

# Time Series

In [1]:
from datetime import date

### Date

In [2]:
# A fixed calendar date (year, month, day), not the actual current date.
today = date(2021,9,3)
print(today)

2021-09-03


In [3]:
# Individual components of the date.
print(today.year)
print(today.month)
print(today.day)
# weekday() counts from Monday = 0, so 4 means Friday.
print(today.weekday())

2021
9
3
4


### Time

In [4]:
from datetime import time
# A fixed time of day; the fourth field is microseconds.
t_now = time(hour=23, minute=14, second=5, microsecond=10)
t_now

datetime.time(23, 14, 5, 10)

In [5]:
# Individual components of the time object.
print(t_now.hour)
print(t_now.minute)
print(t_now.second)
print(t_now.microsecond)

23
14
5
10


### Datetime

In [6]:
from datetime import datetime
# A fixed timestamp that combines a calendar date with a time of day.
now = datetime(
    year=2021, month=9, day=3,
    hour=23, minute=14, second=5, microsecond=10,
)

print(now)

2021-09-03 23:14:05.000010


In [7]:
# Extract the date part of the datetime and print its components.
now_date = now.date()
print(now_date)
print(now_date.year)
print(now_date.month)
print(now_date.day)

2021-09-03
2021
9
3


In [8]:
# Extract the time-of-day part of the datetime and print its components.
now_time = now.time()
print(now_time)
print(now_time.hour)
print(now_time.minute)
print(now_time.second)
print(now_time.microsecond)

23:14:05.000010
23
14
5
10


### timedelta

In [9]:
# Subtracting one date from another yields a timedelta.
t1 = date(year=2019, month=10, day=5)
t2 = date(year=2021, month=11, day=1)
t2 - t1

datetime.timedelta(days=758)

In [10]:
from datetime import timedelta 

# Shift a datetime one day forwards and backwards with a timedelta.
difference = timedelta(days=1)
end_of_month = datetime(year=2021, month=8, day=31)

next_day, prev_day = end_of_month + difference, end_of_month - difference

print(next_day, prev_day, sep="\n")

2021-09-01 00:00:00
2021-08-30 00:00:00


### strptime

In [11]:
from datetime import datetime
date_in_string = "2021-08-03"

# Parse the text into a datetime: %Y is the four-digit year, %m the month and
# %d the day of the month. No time is given, so it defaults to midnight.
date_in_datetime = datetime.strptime(date_in_string,'%Y-%m-%d')
date_in_datetime

datetime.datetime(2021, 8, 3, 0, 0)

---

---

# Discretization

### cut

In [14]:
# Ten sample timestamps paired with an area value each.
times = [
    datetime(2021, 8, 24, 12, 24),
    datetime(2021, 9, 2, 23, 49),
    datetime(2021, 9, 10, 2, 15),
    datetime(2021, 8, 30, 16, 35),
    datetime(2021, 9, 4, 15, 40),
    datetime(2021, 9, 14, 18, 0),
    datetime(2021, 9, 12, 21, 0),
    datetime(2021, 9, 17, 20, 12),
    datetime(2021, 9, 1, 7, 4),
    datetime(2021, 9, 10, 14, 20),
]

df = pd.DataFrame({
    'time': times,
    'area': [2, 4, 10, 4, 3, 1, 7, 5, 2, 8],
})
df

Unnamed: 0,time,area
0,2021-08-24 12:24:00,2
1,2021-09-02 23:49:00,4
2,2021-09-10 02:15:00,10
3,2021-08-30 16:35:00,4
4,2021-09-04 15:40:00,3
5,2021-09-14 18:00:00,1
6,2021-09-12 21:00:00,7
7,2021-09-17 20:12:00,5
8,2021-09-01 07:04:00,2
9,2021-09-10 14:20:00,8


In [15]:
# Each constant is the exclusive upper bound of the period it is named after:
# "dawn" ends at 07:00, "morning" at 11:00, "noon" at 15:00, "afternoon" at 19:00.
dawn = time(hour=7)
morning = time(hour=11)
noon = time(hour=15)
afternoon = time(hour=19)


def discretize_time(time):
    """Map a datetime-like value to a part-of-day label.

    `time` must provide a ``.time()`` method, as ``datetime`` objects do.
    Returns the label of the first period whose upper bound is later than
    the value's time of day, or "night" when it is 19:00 or later.
    """
    clock = time.time()
    periods = (
        (dawn, "dawn"),
        (morning, "morning"),
        (noon, "noon"),
        (afternoon, "afternoon"),
    )
    for upper_bound, label in periods:
        if clock < upper_bound:
            return label
    return "night"

# Replace each timestamp with its part-of-day label.
# NOTE(review): this overwrites the original `time` column, so the cell cannot
# be re-run as is: the string labels have no .time() method for
# discretize_time to call.
df['time'] = df.time.apply(discretize_time)
df

Unnamed: 0,time,area
0,noon,2
1,night,4
2,dawn,10
3,afternoon,4
4,afternoon,3
5,afternoon,1
6,night,7
7,night,5
8,morning,2
9,noon,8


### Let's do it with the `cut` function

In [16]:
# Eight people with their ages, used as input for binning.
df = pd.DataFrame({
    'name': ["helma", "sajede", "ali", "hossein", "mohsen", "haniye", "hasan", "sajjad"],
    'age': [5, 15, 12, 60, 35, 17, 71, 23],
})
df

Unnamed: 0,name,age
0,helma,5
1,sajede,15
2,ali,12
3,hossein,60
4,mohsen,35
5,haniye,17
6,hasan,71
7,sajjad,23


In [17]:
# Bin edges; pd.cut makes each interval right-closed by default:
# (1, 11], (11, 18], (18, 35], (35, 50], (50, 100]. An age of exactly 35 is
# therefore labelled "young", and ages outside (1, 100] become NaN.
bins = [1, 11, 18, 35, 50, 100]
# One label per interval, in the same order as the bins.
labels = ["child", "teenager", "young", "middleaged", "old"]
# Overwrites the numeric `age` column with its categorical label.
df['age'] = pd.cut(df.age, bins, labels=labels)
df

Unnamed: 0,name,age
0,helma,child
1,sajede,teenager
2,ali,teenager
3,hossein,old
4,mohsen,young
5,haniye,teenager
6,hasan,old
7,sajjad,young
