In [1]:
# 필요한 패키지 Import

import numpy as np
import pandas as pd
from glob import glob
import json

In [2]:
# Load the labelled training table and the submission template.
train, submit = (pd.read_csv(name) for name in ('train_Y.csv', 'sample_submit.csv'))

# Target column used to fit the classifier.
y_train = train.loc[:, 'label']

# User ids of each split, plus one combined Series of both splits
# sorted by id and re-indexed 0..n-1.
train_user, test_user = train.loc[:, 'user'], submit.loc[:, 'user']
total_user = (
    pd.concat([train_user, test_user])
    .sort_values()
    .reset_index(drop=True)
)

In [3]:
# Collect the EMA response files of each questionnaire, sorted by path.
sleep_list = glob('StduentLife_EMA/EMA/response/Sleep/*.json')
sleep_list.sort()
social_list = glob('StduentLife_EMA/EMA/response/Social/*.json')
social_list.sort()
activity_list = glob('StduentLife_EMA/EMA/response/Activity/*.json')
activity_list.sort()


def _paths_for_known_users(paths):
    """Return, as an array, the paths whose text contains an id from total_user.

    Paths keep the order they were given in. A path is emitted once per
    matching id, so it repeats if more than one id is a substring of it.
    """
    kept = []
    for path in paths:
        for user in total_user:
            if user in path:
                kept.append(path)
    return np.array(kept)


# NOTE(review): later cells pair the k-th path with the k-th entry of
# total_user, which presumably relies on every user having exactly one file
# per questionnaire and on path order matching user order -- confirm.
sleep = _paths_for_known_users(sleep_list)
social = _paths_for_known_users(social_list)
activity = _paths_for_known_users(activity_list)

In [4]:
# Parse the Sleep EMA responses: one list of values per response file.
# 'hour' answers (choices '1'..'19') are converted with int(answer) * 0.5 + 2.5.
# 'rate' answers (choices '1'..'4') are converted with (int(answer) - 5) * -1.
# Answers outside the choice set are stored as the string 'NaN'; a later cell
# casts the parsed lists to float64, which turns that marker into a real NaN.

people_hour = []
people_rate = []

hour_choice_set = [str(x) for x in range(1, 20)]
rate_choice_set = [str(x) for x in range(1, 5)]


for data in sleep:
    # Use a context manager so each file handle is closed right after parsing
    # (the bare open() leaked one handle per file), and read as UTF-8 so the
    # result does not depend on the platform's default encoding.
    with open(data, encoding='utf-8') as fh:
        sleep_data = json.load(fh)

    # Only responses that actually contain the question contribute a value.
    people_hour.append([
        int(res['hour']) * 0.5 + 2.5 if res['hour'] in hour_choice_set else 'NaN'
        for res in sleep_data
        if 'hour' in res
    ])
    people_rate.append([
        (int(res['rate']) - 5) * -1 if res['rate'] in rate_choice_set else 'NaN'
        for res in sleep_data
        if 'rate' in res
    ])

In [24]:
# Parse the Social EMA responses: one list of values per response file.
# The 'number' answer (choices '1'..'6') is used as an index into
# social_answer_set; index 0 holds the 'NaN' marker for invalid answers.

people_contact = []

social_answer_set = ['NaN', 0, 5, 10, 20, 50, 100]
social_choice_set = [str(x) for x in range(1, 7)]

for data in social:
    # Use a context manager so each file handle is closed right after parsing
    # (the bare open() leaked one handle per file), and read as UTF-8 so the
    # result does not depend on the platform's default encoding.
    with open(data, encoding='utf-8') as fh:
        social_data = json.load(fh)

    # Only responses that actually contain the question contribute a value.
    people_contact.append([
        social_answer_set[int(res['number']) if res['number'] in social_choice_set else 0]
        for res in social_data
        if 'number' in res
    ])

In [6]:
# Parse the Activity EMA responses: one list of values per response file and
# per question ('working', 'other_working', 'relaxing', 'other_relaxing').
# Each answer (choices '1'..'5') is used as an index into people_answer_set;
# index 0 holds the 'NaN' marker for invalid answers.

people_work_alone = []
people_work_other = []
people_relaxing_alone = []
people_relaxing_other = []

people_answer_set = ['NaN', 0, 0.11, 0.26, 0.51, 0.76]
people_choice_set = [str(x) for x in range(1, 6)]

# Question key -> list that collects its parsed answers. One loop replaces
# four copy-pasted comprehensions that differed only in the key.
_activity_buckets = {
    'working': people_work_alone,
    'other_working': people_work_other,
    'relaxing': people_relaxing_alone,
    'other_relaxing': people_relaxing_other,
}

for act in activity:
    # Use a context manager so each file handle is closed right after parsing
    # (the bare open() leaked one handle per file), and read as UTF-8 so the
    # result does not depend on the platform's default encoding.
    with open(act, encoding='utf-8') as fh:
        activity_data = json.load(fh)

    for question, bucket in _activity_buckets.items():
        # Only responses that actually contain the question contribute a value.
        bucket.append([
            people_answer_set[int(res[question]) if res[question] in people_choice_set else 0]
            for res in activity_data
            if question in res
        ])

In [7]:
# Put the user column in front of every parsed response table.


def _with_user_column(responses):
    """Build a float64 frame from the per-file answer lists and prepend total_user.

    The two pieces are joined column-wise on their 0..n-1 row index.
    """
    answers = pd.DataFrame(responses).astype('float64')
    return pd.concat([total_user, answers], axis=1)


student_sleep = _with_user_column(people_hour)
student_sleep_rate = _with_user_column(people_rate)
student_contact = _with_user_column(people_contact)
student_working_alone = _with_user_column(people_work_alone)
student_working_other = _with_user_column(people_work_other)
student_relaxing_alone = _with_user_column(people_relaxing_alone)
student_relaxing_other = _with_user_column(people_relaxing_other)

In [8]:
# Extract statistical features with DataFrame.describe(), splitting the
# combined tables back into train users and test users along the way.


def _per_user_stats(table, users):
    """Return describe() of the selected users' answers, one output column per user.

    Rows whose 'user' is in `users` are kept, the 'user' column is dropped and
    the rest is transposed so that describe() summarises each user separately.
    """
    picked = table.loc[table['user'].isin(users)]
    return picked.drop(columns='user').T.describe()


train_features_student_sleep = _per_user_stats(student_sleep, train_user)
train_features_student_sleep_rate = _per_user_stats(student_sleep_rate, train_user)
train_features_student_contact = _per_user_stats(student_contact, train_user)
train_features_student_working_alone = _per_user_stats(student_working_alone, train_user)
train_features_student_working_other = _per_user_stats(student_working_other, train_user)
train_features_student_relaxing_alone = _per_user_stats(student_relaxing_alone, train_user)
train_features_student_relaxing_other = _per_user_stats(student_relaxing_other, train_user)

test_features_student_sleep = _per_user_stats(student_sleep, test_user)
test_features_student_sleep_rate = _per_user_stats(student_sleep_rate, test_user)
test_features_student_contact = _per_user_stats(student_contact, test_user)
test_features_student_working_alone = _per_user_stats(student_working_alone, test_user)
test_features_student_working_other = _per_user_stats(student_working_other, test_user)
test_features_student_relaxing_alone = _per_user_stats(student_relaxing_alone, test_user)
test_features_student_relaxing_other = _per_user_stats(student_relaxing_other, test_user)

In [9]:
# Replace missing statistics with the mean of the same statistic over the
# training users.

from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')


def _impute_stat_table(train_stats, test_stats):
    """Mean-impute a pair of (statistic x user) tables and return them as arrays.

    SimpleImputer computes one fill value per column. The describe() tables
    have statistics as rows and users as columns, so imputing them directly
    would fill a user's missing value with the mean of that user's other,
    unrelated statistics, and would fill test column j from train user j.
    Both tables are therefore transposed to (user x statistic) for imputing
    and transposed back, so callers still receive (statistic x user) arrays.

    Parameters
    ----------
    train_stats, test_stats : array-like, shape (n_statistics, n_users)
        Tables to fill. The fill values are learned from train_stats only.

    Returns
    -------
    (train_filled, test_filled) : tuple of ndarray, same layout as the inputs.
    """
    train_by_user = np.asarray(train_stats, dtype='float64').T
    test_by_user = np.asarray(test_stats, dtype='float64').T
    # NOTE(review): a statistic that is NaN for every training user would be
    # dropped by SimpleImputer in its default configuration -- confirm this
    # cannot happen for the data at hand.
    train_filled = imp_mean.fit_transform(train_by_user).T
    test_filled = imp_mean.transform(test_by_user).T
    return train_filled, test_filled


train_features_student_sleep, test_features_student_sleep = _impute_stat_table(
    train_features_student_sleep, test_features_student_sleep)

train_features_student_sleep_rate, test_features_student_sleep_rate = _impute_stat_table(
    train_features_student_sleep_rate, test_features_student_sleep_rate)

train_features_student_contact, test_features_student_contact = _impute_stat_table(
    train_features_student_contact, test_features_student_contact)

train_features_student_working_alone, test_features_student_working_alone = _impute_stat_table(
    train_features_student_working_alone, test_features_student_working_alone)

train_features_student_working_other, test_features_student_working_other = _impute_stat_table(
    train_features_student_working_other, test_features_student_working_other)

train_features_student_relaxing_alone, test_features_student_relaxing_alone = _impute_stat_table(
    train_features_student_relaxing_alone, test_features_student_relaxing_alone)

train_features_student_relaxing_other, test_features_student_relaxing_other = _impute_stat_table(
    train_features_student_relaxing_other, test_features_student_relaxing_other)

In [17]:
# Build the model inputs from every statistic except 'count':
# rows 1..7 of each table (row 0 is skipped).

train_features = [
    train_features_student_sleep,
    train_features_student_sleep_rate,
    train_features_student_contact,
    train_features_student_working_alone,
    train_features_student_working_other,
    train_features_student_relaxing_alone,
    train_features_student_relaxing_other,
]

# The stray '+' that followed the sixth entry has been removed; it only parsed
# because unary plus is defined for arrays.
test_features = [
    test_features_student_sleep,
    test_features_student_sleep_rate,
    test_features_student_contact,
    test_features_student_working_alone,
    test_features_student_working_other,
    test_features_student_relaxing_alone,
    test_features_student_relaxing_other,
]


def _stack_user_features(tables):
    """Return one row per user: rows 1..7 of every table, concatenated in list order.

    Each table is laid out as (statistic x user). The number of users is taken
    from the tables themselves instead of a hard-coded 23, so the cell also
    works when the train and test splits have a different number of users.
    """
    return np.hstack([np.asarray(table)[1:8, :].T for table in tables])


X_train = _stack_user_features(train_features)
X_test = _stack_user_features(test_features)

In [13]:
from sklearn.svm import SVC

# The rows of X_train / X_test come from tables keyed by `total_user`, i.e.
# they are in sorted-user order, while `y_train` and `submit` are in CSV row
# order. Align labels and predictions by user id so that an unsorted CSV
# cannot silently pair a user's features with another user's label.
# Assumes each user id appears once per CSV.
train_row_users = total_user[total_user.isin(train_user)].to_numpy()
test_row_users = total_user[total_user.isin(test_user)].to_numpy()

# Labels reordered to match the row order of X_train.
y_train_by_row = train.set_index('user')['label'].loc[train_row_users].to_numpy()

svm = SVC(class_weight='balanced', kernel='linear', C=1)
svm.fit(X_train, y_train_by_row)
y_pred = svm.predict(X_test)

# Map each prediction back to the row of `submit` that holds the same user.
pred_by_user = pd.Series(y_pred, index=test_row_users)
submit['label'] = pred_by_user.loc[submit['user']].to_numpy()
submit.to_csv("submit2.csv", index=False)