## 데이터프레임 생성

In [3]:
import numpy as np
import pandas as pd

print(np.__version__)
print(pd.__version__)

2.1.1
2.2.3


In [None]:
- 첫번째 방법 : 리스트 활용
- 두번째 방법 : 딕셔너리 활용 - 개인적 선호

In [9]:
# Method 1: build the frame from a list of rows (one inner list per stock).
# Stock codes are written as strings, which keeps their leading zero.
columns = ["종목코드", "종목명", "현재가", "등락률"]
data = [
    ["039900", "알파코", 10000000, 10.05],
    ["039910", "A", 5000000, 1.05],
    ["039920", "B", 1000, 1.28],
]

df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,종목코드,종목명,현재가,등락률
0,39900,알파코,10000000,10.05
1,39910,A,5000000,1.05
2,39920,B,1000,1.28


In [8]:
# Method 2: build the frame from a dict that maps each column name to its
# list of values.
data2 = {
    "종목코드": ["03990", "03991", "03992"],
    "종목명": ["알파코", "A", "B"],
}

df2 = pd.DataFrame(data2)
df2

Unnamed: 0,종목코드,종목명
0,3990,알파코
1,3991,A
2,3992,B


In [10]:
import pandas as pd
import random
import string


# Regenerating the dictionary where each key (종목코드, 종목명, 현재가, 등락률) has a list of values

# One independent, initially empty list per column.
data_dict = {column: [] for column in ("종목코드", "종목명", "현재가", "등락률")}

# Function to generate simpler 종목코드 and 종목명 ensuring the 종목코드 starts with '0'
def generate_code_name_for_dict(existing_codes):
    """Return a random ``(code, name)`` pair whose code is not already taken.

    Args:
        existing_codes: Container of codes that must not be returned; only
            membership tests (``in``) are performed on it.

    Returns:
        A tuple ``(code, name)``: ``code`` is ``'0'`` followed by five random
        digits and ``name`` is two random uppercase ASCII letters.
    """
    while True:
        # Draw the digits first and the letters second on every attempt.
        digits = random.choices(string.digits, k=5)
        letters = random.choices(string.ascii_uppercase, k=2)
        candidate = "0" + "".join(digits)
        if candidate in existing_codes:
            continue  # already taken: draw a fresh pair
        return candidate, "".join(letters)

# Fill data_dict with 10,000 synthetic rows. The set records every code handed
# out so far, so no code is used twice.
existing_codes_for_dict = set()

for _ in range(10000):
    code, name = generate_code_name_for_dict(existing_codes_for_dict)
    existing_codes_for_dict.add(code)
    current_price = random.randint(1000, 1000000)  # both bounds inclusive
    change_rate = round(random.uniform(-5, 5), 2)  # rounded to two decimals

    new_row = {
        "종목코드": code,
        "종목명": name,
        "현재가": current_price,
        "등락률": change_rate,
    }
    for column, value in new_row.items():
        data_dict[column].append(value)

# Keep only the first five values of each column for a quick look.
data_dict_preview = {column: values[:5] for column, values in data_dict.items()}

pd.DataFrame(data_dict_preview)

Unnamed: 0,종목코드,종목명,현재가,등락률
0,65117,HS,192000,0.69
1,42983,UO,51702,-3.32
2,70482,IA,184732,1.92
3,12280,SS,766309,-1.7
4,86243,HK,346920,-0.07


In [11]:
df2 = pd.DataFrame(data_dict_preview)
df2

Unnamed: 0,종목코드,종목명,현재가,등락률
0,65117,HS,192000,0.69
1,42983,UO,51702,-3.32
2,70482,IA,184732,1.92
3,12280,SS,766309,-1.7
4,86243,HK,346920,-0.07


In [15]:
df2.head(1)  # argument is the number of rows, counted from the top

Unnamed: 0,종목코드,종목명,현재가,등락률
0,65117,HS,192000,0.69


In [14]:
df2.tail(1)  # rows counted from the bottom

Unnamed: 0,종목코드,종목명,현재가,등락률
4,86243,HK,346920,-0.07


In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   종목코드    5 non-null      object 
 1   종목명     5 non-null      object 
 2   현재가     5 non-null      int64  
 3   등락률     5 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 292.0+ bytes


In [17]:
df2.describe()

Unnamed: 0,현재가,등락률
count,5.0,5.0
mean,308332.6,-0.496
std,276541.596055,2.052421
min,51702.0,-3.32
25%,184732.0,-1.7
50%,192000.0,-0.07
75%,346920.0,0.69
max,766309.0,1.92


In [23]:
# Dict-based construction again, this time with all four columns.
data2 = {
    "종목코드": ["039900", "039910", "039920"],
    "종목명": ["알파코", "A", "B"],
    "현재가": [10000000, 500000, 1000],
    "등락률": [10.05, 1.05, 1.28],
}

df2 = pd.DataFrame(data2)
df2

Unnamed: 0,종목코드,종목명,현재가,등락률
0,39900,알파코,10000000,10.05
1,39910,A,500000,1.05
2,39920,B,1000,1.28


In [25]:
df2 =df2.set_index("종목코드")
df2

Unnamed: 0_level_0,종목명,현재가,등락률
종목코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39900,알파코,10000000,10.05
39910,A,500000,1.05
39920,B,1000,1.28


In [26]:
df2.reset_index()

Unnamed: 0,종목코드,종목명,현재가,등락률
0,39900,알파코,10000000,10.05
1,39910,A,500000,1.05
2,39920,B,1000,1.28


In [22]:
df2.reset_index(drop=True)

Unnamed: 0,종목명,현재가,등락률
0,알파코,10000000,10.05
1,A,500000,1.05
2,B,1000,1.28


In [29]:
from pandas import DataFrame

# Each row holds the values for the three columns named below; the stock code
# is supplied separately as the row label.
data = [
    ["알파코", 10000000, 10.05], 
    ["A", 500000, 1.05], 
    ["B", 1000, 1.28]
]

# NOTE(review): "039900" is listed twice, so the row labels are not unique —
# the third label was possibly meant to be "039920"; confirm.
index = ["039900", "039910", "039900"]
columns = ["종목명", "현재가", "등락률"]
df = DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,종목명,현재가,등락률
39900,알파코,10000000,10.05
39910,A,500000,1.05
39900,B,1000,1.28


In [30]:
df.index

Index(['039900', '039910', '039900'], dtype='object')

In [33]:
df.values

array([['알파코', 10000000, 10.05],
       ['A', 500000, 1.05],
       ['B', 1000, 1.28]], dtype=object)

In [34]:
# Access a column with attribute (dot) syntax

df.현재가

039900    10000000
039910      500000
039900        1000
Name: 현재가, dtype: int64

In [36]:
df['현재가']

039900    10000000
039910      500000
039900        1000
Name: 현재가, dtype: int64

In [37]:
# Selecting with a list of names returns a DataFrame rather than a Series.
# The two classes expose different attributes and methods; calling one that
# the class does not have raises AttributeError.
df[['현재가']]

Unnamed: 0,현재가
39900,10000000
39910,500000
39900,1000


## 로우 인덱싱
- loc, iloc
  + loc : 행/열의 label(이름)을 기준으로 인덱싱
  + iloc : 0부터 시작하는 정수 위치(integer position)를 기준으로 인덱싱

In [40]:
import seaborn as sns

sns.__version__

'0.13.2'

In [42]:
iris = sns.load_dataset("iris")
iris.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [50]:
iris.loc[[0,5], ["sepal_width", "species"]]

Unnamed: 0,sepal_width,species
0,3.5,setosa
5,3.9,setosa


In [52]:
# Positional selection: rows 0 and 9, columns 1 and 4 (sepal_width, species).
# Bare numbers are harder for other readers to follow than labels.
iris.iloc[[0,9],[1,4]]

Unnamed: 0,sepal_width,species
0,3.5,setosa
9,3.1,setosa


In [93]:
# A comparison on a column (e.g. iris['sepal_width'] > 3.5) produces a boolean
# mask; .loc keeps only the rows where the mask is True.
wide_sepal_mask = iris['sepal_width'] >= 4.0
matching_rows = iris.loc[wide_sepal_mask, :]

# reset_index(drop=True) renumbers the remaining rows from 0 and discards the
# old labels, leaving a clean frame.
result = matching_rows.reset_index(drop=True)

In [63]:
result

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.8,4.0,1.2,0.2,setosa
1,5.7,4.4,1.5,0.4,setosa
2,5.2,4.1,1.5,0.1,setosa
3,5.5,4.2,1.4,0.2,setosa


In [111]:
iris.loc[iris['petal_width']> 2.3,['species']].reset_index(drop=True)

Unnamed: 0,species
0,virginica
1,virginica
2,virginica
3,virginica
4,virginica
5,virginica


In [112]:
iris.loc[iris['sepal_width'] <= 4.0, :].reset_index(drop=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,virginica
143,6.3,2.5,5.0,1.9,virginica
144,6.5,3.0,5.2,2.0,virginica
145,6.2,3.4,5.4,2.3,virginica


In [97]:
# Filter rows by comparing a string column against a value

iris.loc[iris['species'] == 'setosa', :].reset_index(drop=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [102]:
# Combining several conditions:
#   iris.loc[(condition1) & (condition2), :]
# & : element-wise AND
# | : element-wise OR
# Each condition needs its own parentheses because & and | bind more tightly
# than the comparison operators.

iris.loc[(iris['species'] == 'setosa') & (iris['sepal_width'] >= 4), :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
15,5.7,4.4,1.5,0.4,setosa
32,5.2,4.1,1.5,0.1,setosa
33,5.5,4.2,1.4,0.2,setosa


In [101]:
iris.loc[(iris['species'] == 'virginica') & (iris['sepal_width'] >= 3.5), :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
109,7.2,3.6,6.1,2.5,virginica
117,7.7,3.8,6.7,2.2,virginica
131,7.9,3.8,6.4,2.0,virginica


In [None]:
# Three conditions chained with &; each one sits in its own parentheses.
# NOTE(review): the third condition was left unfinished in the original cell
# (`(iris[], :]`, a syntax error). The petal_width filter below is only an
# illustrative completion — replace it with the intended column and threshold.
iris.loc[(iris['species'] == 'virginica') &
         (iris['sepal_width'] >= 3.5) &
         (iris['petal_width'] >= 2.2), :]

## 컬럼 추가하기

In [116]:
# Add the new columns to a copy of the frame.
iris2 = iris.copy()

# Assigning a scalar fills the whole column with that value.
iris2['newCol'] = 0

# Element-wise sum of the two sepal measurements.
iris2['sepals'] = iris2['sepal_length'].add(iris2['sepal_width'])
iris2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,newCol,sepals
0,5.1,3.5,1.4,0.2,setosa,0,8.6
1,4.9,3.0,1.4,0.2,setosa,0,7.9
2,4.7,3.2,1.3,0.2,setosa,0,7.9
3,4.6,3.1,1.5,0.2,setosa,0,7.7
4,5.0,3.6,1.4,0.2,setosa,0,8.6
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,0,9.7
146,6.3,2.5,5.0,1.9,virginica,0,8.8
147,6.5,3.0,5.2,2.0,virginica,0,9.5
148,6.2,3.4,5.4,2.3,virginica,0,9.6


In [117]:
from pandas import DataFrame

data = [
    ["알파코", 10000000, 10.05], 
    ["A", 500000, 1.05], 
    ["B", 1000, 1.28]
]

index = ["039900", "039910", "039900"]
columns = ["종목명", "현재가", "등락률"]
df = DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,종목명,현재가,등락률
39900,알파코,10000000,10.05
39910,A,500000,1.05
39900,B,1000,1.28


In [119]:
# Drop rows by label. The label "039900" appears twice in the index, so both
# of those rows are removed and only "039910" remains.
df.drop("039900", axis = 0)

# axis = 0 : operate on rows
# axis = 1 : operate on columns

Unnamed: 0,종목명,현재가,등락률
39910,A,500000,1.05


In [None]:
# inplace=True modifies df itself instead of returning a new frame, so the
# dropped column cannot be recovered from df afterwards.
df.drop("종목명", axis = 1, inplace = True)

In [120]:
# Rename a column

from pandas import DataFrame

data = [
    ["알파코", 10000000, 10.05], 
    ["A", 500000, 1.05], 
    ["B", 1000, 1.28]
]

index = ["039900", "039910", "039900"]
columns = ["종목명", "현재가", "등락률"]
df = DataFrame(data=data, index=index, columns=columns)
# rename maps old column names to new ones; the result is bound to df2.
# NOTE(review): '종목명' holds the stock names, so 'code' looks like a
# misleading target name — confirm the intended new name.
df2 = df.rename(columns = {'종목명' : 'code'})
df2  # NOTE(review): the original note here read "AttributeError: the method does not exist"; it is unclear which call it refers to

Unnamed: 0,code,현재가,등락률
39900,알파코,10000000,10.05
39910,A,500000,1.05
39900,B,1000,1.28


In [None]:
# 연습문제
# 03/02, 03/03 컬럼만 int형으로 변경하세요. 
# 쉼표를 제거하고 int로 변환하는 함수 정의 

In [11]:
import pandas as pd

# Build a frame whose values are all comma-formatted number strings.
columns = ["03/02", "03/03", "03/04"]
data = [
    ["1,000", "1,100", "1,510"],
    ["1,410", "1,420", "1,790"],
    ["850", "900", "1,185"],
]
df = pd.DataFrame(data, columns=columns)
df.info()  # the values are strings, so no column is numeric yet

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   03/02   3 non-null      object
 1   03/03   3 non-null      object
 2   03/04   3 non-null      object
dtypes: object(3)
memory usage: 204.0+ bytes


In [12]:
x = "1,000"
int(x.replace(",",""))

1000

In [13]:
def rm_comma(x):
    """Convert a comma-grouped number such as ``"1,000"`` to an ``int``.

    Args:
        x: A string of digits with optional thousands separators, or a value
            that is already an integer.

    Returns:
        The integer value with every comma removed.

    Raises:
        ValueError: If the text left after removing the commas is not an
            integer literal.
    """
    # Going through str() lets already-converted values pass straight through,
    # so re-running the conversion on an int column no longer fails with
    # AttributeError ('int' object has no attribute 'replace').
    return int(str(x).replace(",", ""))

y = "1,000"
print(rm_comma(y))

1000


In [14]:
# Convert only the first two date columns; "03/04" is left as text.
for target_column in ("03/02", "03/03"):
    df[target_column] = df[target_column].apply(rm_comma)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   03/02   3 non-null      int64 
 1   03/03   3 non-null      int64 
 2   03/04   3 non-null      object
dtypes: int64(2), object(1)
memory usage: 204.0+ bytes


In [9]:
## CG
columns = ["03/02", "03/03", "03/04"]
data = [
    ["1,000", "1,100", "1,510"],
    ["1,410", "1,420", "1,790"],
    ["850", "900", "1,185"],
]
df = pd.DataFrame(data, columns=columns)

# Strip the thousands separators from every column, then cast the whole
# frame to int in one step.
without_commas = df.apply(lambda col: col.str.replace(",", "", regex=False))
df = without_commas.astype(int)

# Inspect the converted frame.
df.info()
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   03/02   3 non-null      int64
 1   03/03   3 non-null      int64
 2   03/04   3 non-null      int64
dtypes: int64(3)
memory usage: 204.0 bytes
   03/02  03/03  03/04
0   1000   1100   1510
1   1410   1420   1790
2    850    900   1185


## 데이터 내보내기

In [1]:
import seaborn as sns
iris = sns.load_dataset("iris")

iris.to_csv("iris_240930.csv", index=False)

In [2]:
# No index=False here (unlike the to_csv call): the row index is written as an
# extra first column, which comes back as "Unnamed: 0" when the file is read.
iris.to_excel("iris_excel_240930.xlsx")

## 데이터 불러오기 

In [7]:
# NOTE(review): the export cell wrote "iris_240930.csv" without a "test/"
# directory — confirm the file was moved there, or adjust the path.
df = pd.read_csv("test/iris_240930.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
import pandas as pd

excel_df = pd.read_excel("iris_excel_240930.xlsx")
excel_df.head(1)

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa


In [4]:
import pandas as pd

df = pd.read_csv("seoul_real_estate.csv")
df.head(1)

Unnamed: 0,ACC_YEAR,SGG_CD,SGG_NM,BJDONG_CD,BJDONG_NM,LAND_GBN,LAND_GBN_NM,BONBEON,BUBEON,BLDG_NM,...,OBJ_AMT,BLDG_AREA,TOT_AREA,FLOOR,RIGHT_GBN,CNTL_YMD,BUILD_YEAR,HOUSE_TYPE,REQ_GBN,RDEALER_LAWDNM
0,2023,11650,서초구,10800,서초동,1.0,대지,1328.0,11.0,대우도씨에빛2,...,38500,38.81,50.85,14.0,,,2005.0,오피스텔,중개거래,서울 서초구
