# 1. 파이스파크와 판다스 연계하기 

## 판다스 모듈

In [27]:
import pandas as pd
import numpy as np

## 파이스파크에서 판다스 API 활용

In [28]:
import pyspark.pandas as ps

## 스파크 세션 처리 

In [29]:
from pyspark.sql import SparkSession

## 1-1 시리즈 클래스 비교해보기 

In [30]:
# pandas-on-Spark Series built from a plain Python list; the single NaN
# entry is why the values are displayed as float64 below.
ps_s = ps.Series(data=[1, 3, 5, np.nan, 6, 8])

In [32]:
ps_s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [31]:
# Plain pandas Series with the same values as ps_s, used for the
# side-by-side comparison in the following cells.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [33]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [34]:
type(ps_s), type(s)

(pyspark.pandas.series.Series, pandas.core.series.Series)

In [35]:
s.value_counts()

1.0    1
3.0    1
5.0    1
6.0    1
8.0    1
dtype: int64

In [36]:
ps_s.value_counts()

1.0    1
3.0    1
5.0    1
6.0    1
8.0    1
dtype: int64

## 1-2 데이터프레임 비교해보기

In [37]:
# Plain pandas DataFrame: two integer columns and one string column,
# labelled with an explicit (non-default) integer index 10..60.
df = pd.DataFrame(
    data=dict(
        a=[1, 2, 3, 4, 5, 6],
        b=[100, 200, 300, 400, 500, 600],
        c=["one", "two", "three", "four", "five", "six"],
    ),
    index=[10, 20, 30, 40, 50, 60],
)

In [38]:
# pandas-on-Spark DataFrame created from exactly the same data and index
# labels as the pandas `df`, so the two can be compared directly.
ps_df = ps.DataFrame(
    data=dict(
        a=[1, 2, 3, 4, 5, 6],
        b=[100, 200, 300, 400, 500, 600],
        c=["one", "two", "three", "four", "five", "six"],
    ),
    index=[10, 20, 30, 40, 50, 60],
)

In [39]:
type(ps_df), type(df)

(pyspark.pandas.frame.DataFrame, pandas.core.frame.DataFrame)

In [40]:
ps_df

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [41]:
df

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


## 1-3 판다스에서 파이스파크 내의 판다스로 변환

In [8]:
# Six consecutive daily timestamps starting 2013-01-01; used as the row
# index of the random DataFrame built in the next cells.
dates = pd.date_range(start='20130101', periods=6)

In [9]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# 6x4 frame of standard-normal samples indexed by `dates`.
# NOTE(review): no random seed is set in the visible cells, so the values
# shown in the outputs below will differ on every run.
pdf = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [11]:
type(pdf)

pandas.core.frame.DataFrame

In [12]:
pdf

Unnamed: 0,A,B,C,D
2013-01-01,0.145406,-0.099882,-0.650064,-0.259359
2013-01-02,1.030907,-0.180774,-1.011417,-0.310595
2013-01-03,0.925675,0.235682,-0.556933,-0.457981
2013-01-04,-0.570068,-0.178906,0.098077,-0.665014
2013-01-05,-0.684657,1.362541,0.559109,-0.062809
2013-01-06,-0.580564,0.730574,-0.12014,0.217583


### 날짜 타입(datetime64) 인덱스에서 에러가 발생해서 인덱스를 문자열로 변경

In [13]:
pdf.index.dtype

dtype('<M8[ns]')

In [14]:
# Replace the datetime64 index with its string form (dtype becomes object,
# as the next cell shows) before handing the frame to pandas-on-Spark.
pdf.index = pdf.index.astype(str)

In [15]:
pdf.index.dtype

dtype('O')

In [16]:
# Convert the pandas DataFrame into a pandas-on-Spark DataFrame. The
# head() output below shows the (string) date labels are kept as the index.
psdf = ps.from_pandas(pdf)

In [17]:
type(psdf)

pyspark.pandas.frame.DataFrame

In [18]:
psdf.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.145406,-0.099882,-0.650064,-0.259359
2013-01-02,1.030907,-0.180774,-1.011417,-0.310595
2013-01-03,0.925675,0.235682,-0.556933,-0.457981
2013-01-04,-0.570068,-0.178906,0.098077,-0.665014
2013-01-05,-0.684657,1.362541,0.559109,-0.062809


## 1-4 판다스를 파이스파크로 바로 변경 

In [19]:
# Obtain a SparkSession named 'ml-bank', pinning the driver to localhost.
# The log line printed after this cell reports that an existing session was
# reused and that only runtime SQL configurations take effect.
# NOTE(review): the driver host/bindAddress settings therefore appear not to
# be applied here; presumably the session was already started by the earlier
# pandas-on-Spark calls. Create the session first if they are required.
spark = (
    SparkSession.builder
    .appName('ml-bank')
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)


23/09/12 15:44:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [20]:
spark

In [21]:
# Build a Spark DataFrame directly from the pandas DataFrame.
# The schema printed below lists only columns A-D: the pandas index (the
# date strings) is not carried over into the Spark DataFrame.
# NOTE(review): pass pdf.reset_index() instead if the dates are needed as a
# regular column.
sdf = spark.createDataFrame(pdf)

In [22]:
sdf.printSchema()

root
 |-- A: double (nullable = true)
 |-- B: double (nullable = true)
 |-- C: double (nullable = true)
 |-- D: double (nullable = true)



In [23]:
sdf.show()

+-------------------+--------------------+-------------------+--------------------+
|                  A|                   B|                  C|                   D|
+-------------------+--------------------+-------------------+--------------------+
| 0.1454056403456232|-0.09988151445490435|-0.6500635905936156|-0.25935895077556187|
| 1.0309068817982507|-0.18077436693820326|-1.0114169189103177|-0.31059481937563427|
| 0.9256754223834368| 0.23568195289837873|-0.5569332098855609|-0.45798071396996154|
|-0.5700684620461162|-0.17890550996997123|0.09807707063653792| -0.6650135664269996|
|-0.6846569727457262|  1.3625408263189376| 0.5591092049617086|-0.06280894787217835|
|-0.5805635171562358|  0.7305743862318991| -0.120139850387852|  0.2175825520257261|
+-------------------+--------------------+-------------------+--------------------+



In [24]:
# Expose the Spark DataFrame through the pandas-on-Spark API. The head()
# output below shows a 0-based integer index rather than the date labels
# that `pdf` had.
psdf1 = sdf.pandas_api()

In [25]:
psdf1.head()

Unnamed: 0,A,B,C,D
0,0.145406,-0.099882,-0.650064,-0.259359
1,1.030907,-0.180774,-1.011417,-0.310595
2,0.925675,0.235682,-0.556933,-0.457981
3,-0.570068,-0.178906,0.098077,-0.665014
4,-0.684657,1.362541,0.559109,-0.062809


In [26]:
psdf1.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object