In [1]:
import csv
import random
from sklearn.linear_model import LinearRegression
import pandas as pd

In [2]:
# Set the seed 
random.seed(42)

### 1. 수강편람 데이터 불러오기

In [3]:
CSV = pd.DataFrame.from_csv("./data.csv", header=0)

#### 1.1 전공필수

In [4]:
CSV_required = CSV[CSV["required"] == 1]
CSV_required.head()

Unnamed: 0_level_0,year,spring,fall,academic_year,required,quota,enrollment,junior,double,second,all
year-semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01,2007,1,0,2,1,440,338,49,0,0,297
2007-01-01,2007,1,0,3,1,200,116,49,0,0,297
2007-02-01,2007,0,1,2,1,420,250,49,0,0,280
2007-02-01,2007,0,1,3,1,260,161,49,0,0,280
2008-01-01,2008,1,0,3,1,210,103,51,0,0,297


#### 1.2 전공선택

In [5]:
CSV_not_required = CSV[CSV["required"] == 0]
CSV_not_required.head()

Unnamed: 0_level_0,year,spring,fall,academic_year,required,quota,enrollment,junior,double,second,all
year-semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01,2007,1,0,2,0,80,55,49,0,0,297
2007-01-01,2007,1,0,3,0,310,185,49,0,0,297
2007-01-01,2007,1,0,4,0,540,201,49,0,0,297
2007-02-01,2007,0,1,2,0,230,113,49,0,0,280
2007-02-01,2007,0,1,3,0,440,159,49,0,0,280


### 2. 정규화(Normalization)

#### 2.1 전공필수

In [6]:
CSV_required = (CSV_required - CSV_required.mean()) / (CSV_required.max() - CSV_required.min())
CSV_required.head()

Unnamed: 0_level_0,year,spring,fall,academic_year,required,quota,enrollment,junior,double,second,all
year-semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01,-0.47619,0.47619,-0.47619,-0.5,,0.221518,0.125873,-0.27619,-0.237037,-0.219048,-0.030099
2007-01-01,-0.47619,0.47619,-0.47619,0.5,,-0.188038,-0.23986,-0.27619,-0.237037,-0.219048,-0.030099
2007-02-01,-0.47619,-0.52381,0.52381,-0.5,,0.187388,-0.019103,-0.27619,-0.237037,-0.219048,-0.190476
2007-02-01,-0.47619,-0.52381,0.52381,0.5,,-0.085649,-0.165725,-0.27619,-0.237037,-0.219048,-0.190476
2008-01-01,-0.37619,0.47619,-0.47619,0.5,,-0.170974,-0.261277,-0.07619,-0.237037,-0.219048,-0.030099


#### 2.2 전공선택

In [7]:
CSV_not_required = (CSV_not_required - CSV_not_required.mean()) / (CSV_not_required.max() - CSV_not_required.min())
CSV_not_required.head()

Unnamed: 0_level_0,year,spring,fall,academic_year,required,quota,enrollment,junior,double,second,all
year-semester,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-01-01,-0.495714,0.514286,-0.514286,-0.257143,,-0.243541,-0.255442,-0.295714,-0.245079,-0.226286,-0.038544
2007-01-01,-0.495714,0.514286,-0.514286,0.07619,,0.080402,0.131463,-0.295714,-0.245079,-0.226286,-0.038544
2007-01-01,-0.495714,0.514286,-0.514286,0.409524,,0.404346,0.179082,-0.295714,-0.245079,-0.226286,-0.038544
2007-02-01,-0.495714,-0.485714,0.485714,-0.257143,,-0.032274,-0.082823,-0.295714,-0.245079,-0.226286,-0.198922
2007-02-01,-0.495714,-0.485714,0.485714,0.07619,,0.263501,0.054082,-0.295714,-0.245079,-0.226286,-0.198922


### 3. 선형회귀 데이터(column) 선택

In [8]:
feature_to_filter = ["academic_year", "junior", "double", "second", "all"]

train_dataset_required = CSV_required[feature_to_filter].values.tolist()
train_target_required = CSV_required["enrollment"].values.tolist()

train_dataset_not_required = CSV_not_required[feature_to_filter].values.tolist()
train_target_not_required = CSV_not_required["enrollment"].values.tolist()

### 4. 선형회귀(Linear Regression)

#### 4.1 전공필수

In [9]:
model_required = LinearRegression()
model_required.fit(train_dataset_required, train_target_required)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### 4.2 전공선택

In [10]:
model_not_required = LinearRegression()
model_not_required.fit(train_dataset_not_required, train_target_not_required)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### 5. 결과

#### 5.1 전공필수

In [11]:
result = pd.DataFrame(list(zip(feature_to_filter, model_required.coef_)), columns=["features", "coef"])
result.head(100)

Unnamed: 0,features,coef
0,academic_year,-0.312152
1,junior,0.08662
2,double,0.455853
3,second,-0.194786
4,all,0.059706


#### 5.2 전공선택

In [12]:
result = pd.DataFrame(list(zip(feature_to_filter, model_not_required.coef_)), columns=["features", "coef"])
result.head(100)

Unnamed: 0,features,coef
0,academic_year,0.412623
1,junior,-0.074518
2,double,0.782608
3,second,-0.439818
4,all,0.169728
