In [131]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### 1. 데이터 로드

In [132]:
# Read the training data from Train.csv and preview its first five rows.
df = pd.read_csv("Train.csv")
df.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [133]:
# Inspect the dataset: per-column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


- column명 정리

Item_Identifier: Unique product ID, 고유 상품 번호

Item_Weight: Weight of product, 상품 무게

Item_Fat_Content: Whether the product is low fat or not, 저지방 함량 유무

Item_Visibility: The % of total display area of all products in a store allocated to the particular product, 모든 상품 대비 해당 상품의 디스플레이 비율. (매장 점유율로 해석)

Item_Type: The category to which the product belongs, 상품의 카테고리

Item_MRP: Maximum Retail Price (list price) of the product, 상품의 최대 소매값

Outlet_Identifier: Unique store ID, 고유 매장 ID

Outlet_Establishment_Year: The year in which store was established, 매장 설립 년도

Outlet_Size: The size of the store in terms of ground area covered, 매장 크기

Outlet_Location_Type: The type of city in which the store is located, 매장이 위치한 도시의 유형

Outlet_Type: Whether the outlet is just a grocery store or some sort of supermarket, 식료품점 또는 슈퍼마켓 분류

Item_Outlet_Sales: Sales of the product in the particular store (the outcome variable to be predicted), 특정 매장의 상품 판매 금액 (예측할 결과 변수)

#### 2. 목표 및 모델 선정 과정
1) 매장 점유율이 높을수록, 판매금액이 늘어날 것인가?
    - 1. 판매상품(Item_Type)의 매장 점유율(Item_Visibility), 상품 판매금액(Item_Outlet_Sales), 최대 소매값(Item_MRP)의 연관성 파악
    - 2. 판매상품의 최대 소매값(Item_MRP), 상품의 저지방 함량유무(Item_Fat_Content)가 매장 점유율 또는 상품 판매금액에 영향을 미치는지 분석
2) 해결 방법 고안( 1)-1 ) : 단순선형회귀모델을 사용
    - 판매상품으로 groupby하고, ID개수 구하기
        - 판매상품 중 항목의 개수가 많은 상위 8개 항목을 지정하여 필터링
        - 상위 8개 항목의 점유율
    - 매장 점유율 
        - 평균 구하기 또는 0%인 데이터는 삭제 또는 평균값으로 채우기
    - 상품 판매금액
        - 평균 구하기
        - 상품별 금액 차이가 클 것으로 예상하여 데이터 스케일링이 필요할 것으로 보인다.  
    - 매장 점유율의 변동에 따른 상품의 판매금액을 알아보기 위해 단순선형회귀모델을 사용할 것
3) 해결 방법 고안( 1)-2 ): 단순선형회귀모델 사용
    - 최대 소매값 vs. 판매금액
    - 상품 무게 vs. 판매금액

#### 3. 문제1

- 데이터 가공: 판매상품 상위 8개 항목 구하기

In [134]:
# Group the rows by Item_Type and count the Item_Identifier entries in each group.
df.groupby(by='Item_Type').Item_Identifier.count()

Item_Type
Baking Goods              648
Breads                    251
Breakfast                 110
Canned                    649
Dairy                     682
Frozen Foods              856
Fruits and Vegetables    1232
Hard Drinks               214
Health and Hygiene        520
Household                 910
Meat                      425
Others                    169
Seafood                    64
Snack Foods              1200
Soft Drinks               445
Starchy Foods             148
Name: Item_Identifier, dtype: int64

In [135]:
# Tally rows per Item_Type; value_counts lists the most frequent type first.
df['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [136]:
# Count rows per Item_Type, then show only the types that have 500 or more rows.
c = df['Item_Type'].value_counts()
c.loc[c >= 500]

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Name: Item_Type, dtype: int64

In [137]:
# Collect the labels (index values) of the item types with 500 or more rows.
best_items = c.loc[c >= 500].index
best_items

Index(['Fruits and Vegetables', 'Snack Foods', 'Household', 'Frozen Foods',
       'Dairy', 'Canned', 'Baking Goods', 'Health and Hygiene'],
      dtype='object')

In [138]:
# Keep only the rows whose Item_Type is one of the high-volume types
# (500 or more rows according to the counts in `c`).
#
# The labels are derived from `c` inside this cell instead of being read from
# `best_items`: this cell rebinds `best_items` from an Index of labels to a
# DataFrame, so reading the old name made the cell non-idempotent -- a second
# run would test Item_Type against the DataFrame rather than the labels.
best_items = df[df.Item_Type.isin(c[c >= 500].index)]
best_items

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.000000,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.650,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
...,...,...,...,...,...,...,...,...,...,...,...,...
8517,FDF53,20.750,reg,0.083607,Frozen Foods,178.8318,OUT046,1997,Small,Tier 1,Supermarket Type1,3608.6360
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136


- 원하는 데이터를 찾기 위해 DataFrame 재구성

In [139]:
# Narrow the filtered frame down to the item type plus the three numeric
# columns (visibility, outlet sales, MRP).
best_items = best_items.loc[:, ['Item_Type', 'Item_Visibility', 'Item_Outlet_Sales', 'Item_MRP']]
best_items

Unnamed: 0,Item_Type,Item_Visibility,Item_Outlet_Sales,Item_MRP
0,Dairy,0.016047,3735.1380,249.8092
3,Fruits and Vegetables,0.000000,732.3800,182.0950
4,Household,0.000000,994.7052,53.8614
5,Baking Goods,0.000000,556.6088,51.4008
6,Snack Foods,0.012741,343.5528,57.6588
...,...,...,...,...
8517,Frozen Foods,0.083607,3608.6360,178.8318
8518,Snack Foods,0.056783,2778.3834,214.5218
8519,Baking Goods,0.046982,549.2850,108.1570
8520,Health and Hygiene,0.035186,1193.1136,85.1224


In [150]:
# Total Item_Visibility per Item_Type (summed over every row of that type).
# NOTE(review): sales and MRP are averaged per type in this notebook while
# visibility is summed, so this figure also grows with the number of rows per
# type -- confirm that mixing sum and mean is intended.
df_item_vis = best_items[['Item_Type', 'Item_Visibility']].groupby('Item_Type').sum()
df_item_vis

Unnamed: 0_level_0,Item_Visibility
Item_Type,Unnamed: 1_level_1
Baking Goods,44.821706
Canned,44.215926
Dairy,49.395349
Frozen Foods,56.192325
Fruits and Vegetables,84.407946
Health and Hygiene,28.712309
Household,55.803305
Snack Foods,80.220267


In [158]:
# Mean Item_Outlet_Sales and mean Item_MRP for each Item_Type.
df_items = best_items[['Item_Type', 'Item_Outlet_Sales', 'Item_MRP']].groupby('Item_Type').mean()
df_items

Unnamed: 0_level_0,Item_Outlet_Sales,Item_MRP
Item_Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Baking Goods,1952.971207,126.380766
Canned,2225.194904,139.763832
Dairy,2232.542597,148.499208
Frozen Foods,2132.867744,138.503366
Fruits and Vegetables,2289.009592,144.581235
Health and Hygiene,2010.000265,130.818921
Household,2258.7843,149.424753
Snack Foods,2277.321739,146.194934


In [159]:
# Attach the per-type visibility figures as a new column; both frames are
# indexed by Item_Type, so the values line up by item type.
df_items['Item_Visibility'] = df_item_vis.Item_Visibility
df_items

Unnamed: 0_level_0,Item_Outlet_Sales,Item_MRP,Item_Visibility
Item_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baking Goods,1952.971207,126.380766,44.821706
Canned,2225.194904,139.763832,44.215926
Dairy,2232.542597,148.499208,49.395349
Frozen Foods,2132.867744,138.503366,56.192325
Fruits and Vegetables,2289.009592,144.581235,84.407946
Health and Hygiene,2010.000265,130.818921,28.712309
Household,2258.7843,149.424753,55.803305
Snack Foods,2277.321739,146.194934,80.220267


- 수치 데이터 가공하기
    - Item_Outlet_Sales, Item_MRP, Item_Visibility를 소수점 둘째 자리까지 반올림하기

In [161]:
# Show the summary rounded to two decimal places. The result is only
# displayed; df_items itself keeps its unrounded values.
df_items.round(decimals=2)

Unnamed: 0_level_0,Item_Outlet_Sales,Item_MRP,Item_Visibility
Item_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baking Goods,1952.97,126.38,44.82
Canned,2225.19,139.76,44.22
Dairy,2232.54,148.5,49.4
Frozen Foods,2132.87,138.5,56.19
Fruits and Vegetables,2289.01,144.58,84.41
Health and Hygiene,2010.0,130.82,28.71
Household,2258.78,149.42,55.8
Snack Foods,2277.32,146.19,80.22
