# Pandas 소개
- dataset을 위한 python library
- 빅데이터를 분석하는 데 편리한 기능을 제공
- Pandas : "Panel Data"와 "Python Data Analysis"에서 유래한 이름

---
- 참고자료
    - [w3schools Pandas tutorial](https://www.w3schools.com/python/pandas/default.asp)
    - [jvns Pandas Cookbook](https://github.com/jvns/pandas-cookbook/tree/master/cookbook)
    - [kaggle pandas tutorial](https://www.kaggle.com/learn/pandas)

## Start pandas and version check

In [1]:
#!pip install pandas
import  pandas as pd

In [2]:
# Show which pandas version is installed.
print(pd.__version__)

1.3.4


## Series
- 1차원 배열
- label을 정해줄 수 있음
    - numpy.array와 차이점!

In [3]:
# Build a Series from a plain list; with no index given, pandas labels the
# values with default integer positions starting at 0.
exam = [90, 70, 100, 95, 85]
examseries = pd.Series(data=exam)
examseries

0     90
1     70
2    100
3     95
4     85
dtype: int64

In [4]:
# Label each score with its subject name so values can be looked up by label.
subject_labels = ['korean', 'english', 'math', 'society', 'science']
examseries = pd.Series(data=exam, index=subject_labels)
examseries

korean      90
english     70
math       100
society     95
science     85
dtype: int64

In [5]:
# Look up a single value by its index label.
examseries['math']

100

In [6]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
dic_exam = {
    'korean': 95,
    'english': 70,
    'math': 100,
    'society': 95,
    'science': 85,
}
examseries = pd.Series(data=dic_exam)
examseries

korean      95
english     70
math       100
society     95
science     85
dtype: int64

## DataFrame
- 행(row)과 열(column)로 이루어진 2차원 표 형태의 자료구조
- 이걸 제일 많이 씀

### Create DataFrame

In [7]:
# Build a DataFrame from a dict: each key becomes a column, and the index
# labels name the rows (one row per subject).
data = {
    "tttangmin": [90, 70, 100, 95, 85],
    "damdami": [100, 90, 80, 100, 70],
}

class_score = pd.DataFrame(
    data=data,
    index=['korean', 'english', 'math', 'society', 'science'],
)
class_score

Unnamed: 0,tttangmin,damdami
korean,90,100
english,70,90
math,100,80
society,95,100
science,85,70


In [8]:
# Load a comma-separated file into a DataFrame. The relative path means
# data.csv is looked up in the current working directory.
df_csv = pd.read_csv('data.csv')

In [11]:
# read_csv also reads delimited .txt files. sep=',' is already the default;
# it is written out here to show where a different delimiter would go.
df_txt = pd.read_csv('data.txt', sep=',')

In [10]:
# Load a JSON file into a DataFrame.
# NOTE(review): this name uses a pd_ prefix while the other loaders use df_
# (df_csv, df_txt) — confirm nothing depends on it before renaming.
pd_json = pd.read_json('data.json')

### selecting DataFrame

In [14]:
# Load the wine-review data; index_col=0 makes the file's first column the
# row index instead of a regular data column.
reviews = pd.read_csv('winemag-data-130k-v2.csv', index_col=0)
# Preview the first three rows.
reviews.head(3)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


#### column selecting

In [15]:
# Select the 'country' column using attribute access.
reviews.country

0            Italy
1         Portugal
2               US
3               US
4               US
            ...   
129966     Germany
129967          US
129968      France
129969      France
129970      France
Name: country, Length: 129971, dtype: object

In [16]:
# Select the 'country' column using bracket (key) access.
reviews['country']

0            Italy
1         Portugal
2               US
3               US
4               US
            ...   
129966     Germany
129967          US
129968      France
129969      France
129970      France
Name: country, Length: 129971, dtype: object

#### loc, iloc selecting
- df.iloc[row_num, col_num] : 정수 위치(0부터 시작)로 선택
- df.loc[row_label, col_label] 또는 df.loc[condition] : label이나 조건(boolean)으로 선택

In [17]:
# Pick one cell (row label 0, column 'country') with a single .loc lookup
# instead of chaining two separate [] selections.
reviews.loc[0, 'country']

'Italy'

In [18]:
# First row only, selected by integer position.
reviews.iloc[0]

country                                                              Italy
description              Aromas include tropical fruit, broom, brimston...
designation                                                   Vulkà Bianco
points                                                                  87
price                                                                  NaN
province                                                 Sicily & Sardinia
region_1                                                              Etna
region_2                                                               NaN
taster_name                                                  Kerin O’Keefe
taster_twitter_handle                                         @kerinokeefe
title                                    Nicosia 2013 Vulkà Bianco  (Etna)
variety                                                        White Blend
winery                                                             Nicosia
Name: 0, dtype: object

In [19]:
# First column only: every row (:) at column position 0.
reviews.iloc[:, 0]

0            Italy
1         Portugal
2               US
3               US
4               US
            ...   
129966     Germany
129967          US
129968      France
129969      France
129970      France
Name: country, Length: 129971, dtype: object

## View Data

In [24]:
# Show the first three rows.
reviews.head(3)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [23]:
# Show the last three rows.
reviews.tail(3)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss
129970,France,"Big, rich and off-dry, this is powered by inte...",Lieu-dit Harth Cuvée Caroline,90,21.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit


In [25]:
# Summarize the index, each column's non-null count and dtype, and memory usage.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 12  winery                 129971 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 17.9+ MB


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
reviews.describe()

Unnamed: 0,points,price
count,129971.0,120975.0
mean,88.447138,35.363389
std,3.03973,41.022218
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


## Cleaning Data
- 비어있는 경우
- 데이터 포맷이 잘못된 경우
- 데이터가 잘못된 경우
- 데이터 중복

In [37]:
# Load the sample data to clean; the file's first column becomes the row index.
df = pd.read_csv('odd_data.txt', sep=',', index_col=0)

In [38]:
# Display the loaded DataFrame.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [40]:
# Drop every row that contains a missing value and keep the result in a new
# DataFrame; df itself is left unchanged.
# Alternative: df.dropna(inplace=True) modifies df directly.
new_df = df.dropna()
new_df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [41]:
# Reload the raw file, then replace every missing value (in any column)
# with the constant 130.
df = pd.read_csv('odd_data.txt', sep=',', index_col=0)
df = df.fillna(130)
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


## Corr

## Plot