### 모듈 선언

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression


### 데이터 로드  
0-21 / 22-23 으로 데이터 저장

In [None]:
def get_ds_infos():
    """
    Load the data-subject attribute table from ``data_subjects_info.csv``.

    The file is read from the current working directory. According to the
    dataset description its columns are:
    0: code [1-24]
    1: weight [kg]
    2: height [cm]
    3: age [years]
    4: gender [0:Female, 1:Male]
    (the columns are not validated here).

    Returns:
        A pandas DataFrame holding the subjects' attributes exactly as
        parsed by ``pandas.read_csv``.
    """
    subjects = pd.read_csv("data_subjects_info.csv")
    print("[INFO] -- Data subjects' information is imported.")
    return subjects

def set_data_types(data_types=("userAcceleration",)):
    """
    Translate sensor names into the CSV column names of their three axes.

    Args:
        data_types: An iterable of sensor names taken from
            [attitude, gravity, rotationRate, userAcceleration].
            The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        A list with one 3-element list of column names per requested sensor:
        ``.roll/.pitch/.yaw`` for "attitude", ``.x/.y/.z`` for everything else.
    """
    attitude_axes = (".roll", ".pitch", ".yaw")
    cartesian_axes = (".x", ".y", ".z")

    return [
        [t + axis for axis in (attitude_axes if t == "attitude" else cartesian_axes)]
        for t in data_types
    ]


def creat_time_series(dt_list, act_labels, trial_codes, mode="mag", labeled=True, ver_ids=(22, 15)):
    """
    Build one long time-series table from the per-subject, per-trial CSV files.

    Args:
        dt_list: A list of 3-element column-name lists (one per sensor), as
            produced by ``set_data_types``.
        act_labels: list of activity codes; the position of a code in this
            list becomes its numeric "act" label.
        trial_codes: list of trial lists, parallel to ``act_labels``.
        mode: "raw" keeps every axis of each sensor,
            [attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)];
            "mag" keeps only the magnitude of each sensor: (x^2+y^2+z^2)^(1/2).
        labeled: True appends the columns
            [act, id, weight, height, age, gender, trial]; False returns
            sensor values only.
        ver_ids: values of the "id" column (subject code minus one) that are
            moved to the verification set. Only used when ``labeled`` is True.
            NOTE(review): the original inline comment said ids 22 and 23 were
            held out, but the code has always used 22 and 15 -- confirm which
            is intended.

    Returns:
        ``(dataset, dataset_ver)``: two pandas DataFrames, the second holding
        the rows whose "id" is in ``ver_ids``. When ``labeled`` is False there
        is no "id" column, so ``dataset_ver`` is an empty frame with the same
        columns.

    Raises:
        ValueError: if ``mode`` is neither "mag" nor "raw".
    """
    if mode not in ("mag", "raw"):
        # Any other value used to fail only after all files had been read,
        # because data width and column names disagreed.
        raise ValueError(f'mode must be "mag" or "raw", got {mode!r}')

    label_cols = ["act", "id", "weight", "height", "age", "gender", "trial"]
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list) * 3
    total_cols = num_data_cols + (len(label_cols) if labeled else 0)

    ds_list = get_ds_infos()

    print("[INFO] -- Creating Time-Series")
    # Collect per-file arrays and concatenate once; the leading empty array
    # keeps the result well-formed (0 rows, right width) if nothing is read.
    chunks = [np.zeros((0, total_cols))]
    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = 'A_DeviceMotion_data/' + act + '_' + str(trial) + '/sub_' + str(int(sub_id)) + '.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:, x_id] = (raw_data[axes] ** 2).sum(axis=1) ** 0.5
                    else:
                        vals[:, x_id * 3:(x_id + 1) * 3] = raw_data[axes].values
                if labeled:
                    # Subject attributes are looked up by index label
                    # ``sub_id - 1``; this assumes the info table has the
                    # default 0..N-1 index with codes 1..N in order.
                    label_row = np.array([act_id,
                                          sub_id - 1,
                                          ds_list["weight"][sub_id - 1],
                                          ds_list["height"][sub_id - 1],
                                          ds_list["age"][sub_id - 1],
                                          ds_list["gender"][sub_id - 1],
                                          trial])
                    # tile keeps a (0, 7) shape for an empty file, so the
                    # concatenation below cannot fail on dimensions.
                    lbls = np.tile(label_row, (len(raw_data), 1))
                    vals = np.concatenate((vals, lbls), axis=1)
                chunks.append(vals)
    dataset = np.concatenate(chunks, axis=0)

    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            # Sensor name is everything before the last dot. The old
            # ``[:-2]`` slice only worked for one-letter axes and turned
            # "attitude.roll" into "attitude.ro".
            cols += [str(axes[0]).rsplit(".", 1)[0]]

    if labeled:
        cols += label_cols

    dataset = pd.DataFrame(data=dataset, columns=cols)

    if not labeled:
        # Without labels there is no "id" column to split on.
        return dataset, dataset.iloc[0:0]

    # Move the held-out subjects into their own frame.
    is_ver = dataset['id'].isin(list(ver_ids))
    dataset_ver = dataset[is_ver]
    dataset = dataset[~is_ver]

    return dataset, dataset_ver
#________________________________
#________________________________


# Activity codes as they appear in the dataset's folder names.
ACT_LABELS = ["dws", "ups", "wlk", "jog", "std", "sit"]
# Trial numbers available for each activity code.
TRIAL_CODES = dict(zip(
    ACT_LABELS,
    ([1, 2, 11], [3, 4, 12], [7, 8, 15], [9, 16], [6, 14], [5, 13]),
))

# Parameters for building labeled time-series from "(A)DeviceMotion_data":
# attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)
sdt = ["attitude", "gravity", "rotationRate", "userAcceleration"]
print("[INFO] -- Selected sensor data types: " + str(sdt))


# ---- walking ("wlk") ----
act_labels = ACT_LABELS[2:3]
print("[INFO] -- == dataset_wlk ==")
print("[INFO] -- Selected activites: " + str(act_labels))
trial_codes = [TRIAL_CODES[act] for act in act_labels]
dt_list = set_data_types(sdt)
dataset_wlk, dataset_ver_wlk = creat_time_series(
    dt_list, act_labels, trial_codes, mode="raw", labeled=True
)
print("[INFO] -- Shape of time-Series dataset:" + str(dataset_wlk.shape))
print("\n")

# ---- jogging ("jog") ----
act_labels = ACT_LABELS[3:4]
print("[INFO] -- == dataset_jog ==")
print("[INFO] -- Selected activites: " + str(act_labels))
trial_codes = [TRIAL_CODES[act] for act in act_labels]
dt_list = set_data_types(sdt)
dataset_jog, dataset_ver_jog = creat_time_series(
    dt_list, act_labels, trial_codes, mode="raw", labeled=True
)
print("[INFO] -- Shape of time-Series dataset:" + str(dataset_jog.shape))
print("\n")

### 데이터 확인  

In [None]:
# Display the walking dataset (rows not held out for verification).
dataset_wlk

In [None]:
# Display the walking rows that were held out for verification.
dataset_ver_wlk

### 데이터 전처리 및 결측값 확인 함수 선언

In [None]:
def split_dataset_by_id(dataset, dataset_name):
    """
    Split a dataset into one DataFrame per value of its 'id' column.

    Args:
        dataset: DataFrame containing an 'id' column.
        dataset_name: prefix used when building the dictionary keys.

    Returns:
        dict mapping ``f'{dataset_name}_{id}'`` to the rows of that id, each
        with a fresh 0-based index.
    """
    return {
        f'{dataset_name}_{subject}': rows.reset_index(drop=True)
        for subject, rows in dataset.groupby('id')
    }

def split_by_trial(datasets_by_id):
    """
    Split every per-id DataFrame further by its 'trial' column.

    Args:
        datasets_by_id: dict of name -> DataFrame, each with a 'trial' column
            (e.g. the output of ``split_dataset_by_id``).

    Returns:
        dict mapping ``f'{name}_trial_{trial}'`` to the rows of that trial,
        each with a fresh 0-based index.
    """
    datasets_by_id_and_trial = {}

    for name, data in datasets_by_id.items():
        for trial, group in data.groupby('trial'):
            datasets_by_id_and_trial[f'{name}_trial_{trial}'] = group.reset_index(drop=True)

    return datasets_by_id_and_trial


def nan_check(datasets_by_id):
    """
    Report missing values in every DataFrame of a dictionary.

    For each frame that contains at least one null value, print its name and
    the per-column null counts. If no frame has any, print a single
    confirmation line instead.

    Args:
        datasets_by_id: dict of name -> DataFrame.
    """
    found_missing = False

    for label, frame in datasets_by_id.items():
        null_counts = frame.isnull().sum()
        if not null_counts.any():
            continue
        found_missing = True
        print(f"Missing values in {label}:")
        print(null_counts)
        print()

    if not found_missing:
        print("== No missing value ==")

def print_lengths_by_id_and_trial(datasets_by_id_and_trial):
    """
    Print the row count of every (id, trial) DataFrame, plus the extremes.

    Keys are expected to end in ``..._<id>_trial_<trial>``; the id is taken
    from the third-from-last underscore-separated field and the trial from
    the last one.

    Three aligned rows are printed (ids, trials, lengths), followed by the
    longest and the shortest entry. An empty dictionary prints a short notice
    instead of raising.

    Args:
        datasets_by_id_and_trial: dict of name -> DataFrame (anything with
            ``len``).
    """
    if not datasets_by_id_and_trial:
        # max()/min() below would raise ValueError on an empty sequence.
        print("No datasets to summarize.")
        return

    ids = []
    trials = []
    lengths = []

    for name, df in datasets_by_id_and_trial.items():
        parts = name.split('_')
        ids.append(parts[-3])
        trials.append(parts[-1])
        lengths.append(len(df))

    def _row(cells):
        # One output line: every cell left-justified to 12 characters.
        return " | ".join(cell.ljust(12) for cell in cells)

    print(_row(f"id: {subject}" for subject in ids))
    print(_row(f"trial: {trial}" for trial in trials))
    print(_row(str(length) for length in lengths))

    # Locate the first longest and first shortest entries.
    max_length = max(lengths)
    min_length = min(lengths)
    max_index = lengths.index(max_length)
    min_index = lengths.index(min_length)

    print(f"\nLongest: id: {ids[max_index]}, trial: {trials[max_index]} with length {max_length}")
    print(f"Shortest: id: {ids[min_index]}, trial: {trials[min_index]} with length {min_length}")

### 전처리한 데이터 확인

In [None]:
# Split each dataset by id and then by trial, print the per-trial lengths
# and check for missing values.
print("wlk dataset")
dataset_wlk_by_id = split_dataset_by_id(dataset_wlk, 'dataset_wlk')
dataset_wlk_by_id_and_trial = split_by_trial(dataset_wlk_by_id)
print_lengths_by_id_and_trial(dataset_wlk_by_id_and_trial)
nan_check(dataset_wlk_by_id_and_trial)

print("\n\njog dataset")
dataset_jog_by_id = split_dataset_by_id(dataset_jog, 'dataset_jog')
dataset_jog_by_id_and_trial = split_by_trial(dataset_jog_by_id)
print_lengths_by_id_and_trial(dataset_jog_by_id_and_trial)
nan_check(dataset_jog_by_id_and_trial)

# This section processes the walking verification set; its banner used to
# read "jog_ver dataset", which made the two verification outputs
# indistinguishable.
print("\n\nwlk_ver dataset")
dataset_ver_wlk_by_id = split_dataset_by_id(dataset_ver_wlk, 'dataset_wlk')
dataset_ver_wlk_by_id_and_trial = split_by_trial(dataset_ver_wlk_by_id)
print_lengths_by_id_and_trial(dataset_ver_wlk_by_id_and_trial)
nan_check(dataset_ver_wlk_by_id_and_trial)

print("\n\njog_ver dataset")
dataset_ver_jog_by_id = split_dataset_by_id(dataset_ver_jog, 'dataset_jog')
dataset_ver_jog_by_id_and_trial = split_by_trial(dataset_ver_jog_by_id)
print_lengths_by_id_and_trial(dataset_ver_jog_by_id_and_trial)
nan_check(dataset_ver_jog_by_id_and_trial)

### 그래프 표출 함수

In [None]:
def plot_line(datasets, data_name, id, trial, data_type, x_end=None, x_start=None):
    """
    Draw a line chart of selected columns for one (id, trial) recording.

    Args:
        datasets: dict keyed by ``f'{data_name}_{id}_trial_{trial}'``.
        data_name: dataset prefix used in the keys (e.g. 'dataset_wlk').
        id: id part of the key, as a string such as '1.0'.
        trial: trial part of the key, as a string such as '7.0'.
        data_type: comma-separated column names; whitespace around each name
            is ignored.
        x_end: optional positional row index at which to stop (exclusive).
        x_start: optional positional row index at which to start. It is now
            honoured even when ``x_end`` is omitted; previously it was
            silently ignored in that case.

    If the key is not present, a message is printed and nothing is drawn.
    """
    name_to_plot = f'{data_name}_{id}_trial_{trial}'

    if name_to_plot not in datasets:
        print(f"ID {id} with Trial {trial} not found in the datasets of type {data_name}.")
        return

    columns_to_plot = [col.strip() for col in data_type.split(',')]
    data_to_plot = datasets[name_to_plot][columns_to_plot]

    # A single positional slice covers every combination: a bound left as
    # None means "no limit" on that side.
    data_to_plot = data_to_plot.iloc[x_start:x_end]

    data_to_plot.plot.line(title=f'Dataset for ID {id} and Trial {trial} in {data_name}', figsize=(14, 4))

    plt.xlabel('Index')
    plt.ylabel('Values')
    plt.show()

def plot_heatmap(datasets, data_name, id, trial, data_type):
    """
    Draw an annotated correlation heatmap of selected columns for one
    (id, trial) recording.

    Args:
        datasets: dict keyed by ``f'{data_name}_{id}_trial_{trial}'``.
        data_name: dataset prefix used in the keys.
        id: id part of the key, as a string.
        trial: trial part of the key, as a string.
        data_type: comma-separated column names; whitespace around each name
            is ignored.

    Values that cannot be converted to numbers become NaN and rows containing
    NaN are dropped before the correlation is computed. A message is printed
    (and nothing drawn) when the key is missing or no numeric rows remain.
    """
    name_to_plot = f'{data_name}_{id}_trial_{trial}'

    if name_to_plot not in datasets:
        print(f"ID {id} with Trial {trial} not found in the datasets of type {data_name}.")
        return

    wanted = [piece.strip() for piece in data_type.split(',')]
    numeric = datasets[name_to_plot][wanted].apply(pd.to_numeric, errors='coerce').dropna()

    if numeric.empty:
        print(f"No valid numeric data available for {name_to_plot}")
        return

    # Pairwise correlation matrix of the selected columns
    corr = numeric.corr()
    names = corr.columns
    size = len(names)

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(corr.values, cmap='coolwarm')

    # One tick per column on both axes, labelled with the column name
    ax.set_xticks(np.arange(size))
    ax.set_yticks(np.arange(size))
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)

    # Tilt the x labels so long column names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write the rounded coefficient into every cell
    for row in range(size):
        for col in range(size):
            ax.text(col, row, np.around(corr.iloc[row, col], decimals=2),
                    ha="center", va="center", color="black")

    cbar = ax.figure.colorbar(im, ax=ax, cmap='coolwarm')
    cbar.ax.set_ylabel("Correlation", rotation=-90, va="bottom")

    plt.title(f'Heatmap for ID {id} and Trial {trial} in {data_name}')
    plt.show()


### Datatype 간 비교

In [None]:
# Walking, id '1.0', trial '7.0': one chart per sensor group, each showing
# the group's three axes over the whole recording.
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "attitude.roll, attitude.pitch, attitude.yaw")
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "gravity.x, gravity.y, gravity.z")
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "rotationRate.x, rotationRate.y, rotationRate.z")
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "userAcceleration.x, userAcceleration.y, userAcceleration.z")

In [None]:
# The same four sensor groups, limited to the first 500 rows (x_end=500).
# NOTE(review): the first call uses id '2.0' while the other three use '1.0'
# -- confirm whether that is intentional.
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '2.0', '7.0', "attitude.roll, attitude.pitch, attitude.yaw", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "gravity.x, gravity.y, gravity.z", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "rotationRate.x, rotationRate.y, rotationRate.z", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "userAcceleration.x, userAcceleration.y, userAcceleration.z", 500)

In [None]:
# Jogging gravity.z for id '1.0' (trial '9.0') and id '11.0' (trials '9.0'
# and '16.0'), each over the whole recording.
plot_line(dataset_jog_by_id_and_trial, 'dataset_jog', '1.0', '9.0', "gravity.z")
plot_line(dataset_jog_by_id_and_trial, 'dataset_jog', '11.0', '9.0', "gravity.z")
plot_line(dataset_jog_by_id_and_trial, 'dataset_jog', '11.0', '16.0', "gravity.z")

#### 상관 계수가 높은 데이터 비교

In [None]:
# Walking, id '1.0', trial '7.0', first 500 rows: selected column pairs and
# single columns plotted for visual comparison.
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "rotationRate.x, rotationRate.y", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "userAcceleration.x, userAcceleration.y", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "attitude.roll, gravity.z", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "gravity.y", 500)
plot_line(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "attitude.pitch", 500)

In [None]:
# Correlation heatmap over all twelve sensor columns for walking, id '1.0',
# trial '7.0'. The backslash continuations sit inside the string literal, so
# the next line's indentation becomes part of the string; plot_heatmap strips
# whitespace around every comma-separated name.
plot_heatmap(dataset_wlk_by_id_and_trial, 'dataset_wlk', '1.0', '7.0', "attitude.roll, attitude.pitch, attitude.yaw,\
              gravity.x, gravity.y, gravity.z, rotationRate.x, rotationRate.y, rotationRate.z,\
              userAcceleration.x, userAcceleration.y, userAcceleration.z")

In [None]:
# Correlation heatmap over all twelve sensor columns for jogging, id '1.0',
# trial '9.0'. The backslash continuations sit inside the string literal, so
# the next line's indentation becomes part of the string; plot_heatmap strips
# whitespace around every comma-separated name.
plot_heatmap(dataset_jog_by_id_and_trial, 'dataset_jog', '1.0', '9.0', "attitude.roll, attitude.pitch, attitude.yaw,\
              gravity.x, gravity.y, gravity.z, rotationRate.x, rotationRate.y, rotationRate.z,\
              userAcceleration.x, userAcceleration.y, userAcceleration.z")

### 데이터 가공 및 주기 확인

In [None]:
def plot_line_peak(datasets, data_name, id, trial, data_type, x_end=None, x_start=None):
    """
    Plot the first selected column of one (id, trial) recording and mark the
    peaks detected in it.

    Peaks are found with ``scipy.signal.find_peaks`` using
    ``distance=10, prominence=0.15, width=1`` over the whole column; the
    x-range arguments only change the visible window, not the data searched.

    Args:
        datasets: dict keyed by ``f'{data_name}_{id}_trial_{trial}'``.
        data_name: dataset prefix used in the keys.
        id: id part of the key, as a string.
        trial: trial part of the key, as a string.
        data_type: comma-separated column names; only the first one is drawn.
        x_end: optional right x-axis limit.
        x_start: optional left x-axis limit. When only ``x_end`` is given the
            left limit is 0; when only ``x_start`` is given the right limit
            is left untouched (previously ``x_start`` alone was ignored).

    If the key is not present, a message is printed and nothing is drawn.
    """
    name_to_plot = f'{data_name}_{id}_trial_{trial}'

    if name_to_plot not in datasets:
        print(f"ID {id} with Trial {trial} not found in the datasets of type {data_name}.")
        return

    columns_to_plot = [col.strip() for col in data_type.split(',')]
    data_to_plot = datasets[name_to_plot][columns_to_plot]
    signal = data_to_plot.iloc[:, 0]

    # Detect peaks in the signal
    peaks, _ = find_peaks(signal, distance=10, prominence=0.15, width=1)

    plt.figure(figsize=(14, 4))
    plt.plot(signal.index, signal, label='Data')
    plt.plot(signal.index[peaks], signal.iloc[peaks], "x", label='Peaks')

    # Restrict the visible x-range if requested; passing None for a side
    # leaves that limit unchanged.
    if x_start is not None or x_end is not None:
        plt.xlim(0 if x_start is None else x_start, x_end)

    plt.title(f'Dataset for ID {id} and Trial {trial} in {data_name}')
    plt.xlabel('Index')
    plt.ylabel('Values')
    plt.legend()
    plt.show()

# "Energy" column: Euclidean norm of the user-acceleration vector
def calculate_energy(data):
    """
    Add an 'energy' column holding sqrt(x^2 + y^2 + z^2) of the
    userAcceleration axes.

    The column is written into ``data`` itself (the frame is modified in
    place) and the same object is returned.
    """
    ax = data['userAcceleration.x']
    ay = data['userAcceleration.y']
    az = data['userAcceleration.z']
    squared_norm = ax**2 + ay**2 + az**2
    data['energy'] = np.sqrt(squared_norm)
    return data

# Integrate the 'energy' column between consecutive peaks
def integrate_energy_over_intervals(data, peaks):
    """
    Integrate ``data['energy']`` over each interval between consecutive peaks.

    Args:
        data: DataFrame with an 'energy' column.
        peaks: increasing positional row indices (the callers in this file
            obtain them from peaks of 'gravity.z').

    Returns:
        numpy array with one trapezoidal integral (unit sample spacing) per
        pair of consecutive peaks, taken over rows ``start`` up to but not
        including ``end``. Empty when fewer than two peaks are given.
    """
    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy.
    trapezoid = getattr(np, "trapezoid", None)
    if trapezoid is None:
        trapezoid = np.trapz

    energy = data['energy']
    integrated_values = [
        trapezoid(energy.iloc[start:end])
        for start, end in zip(peaks[:-1], peaks[1:])
    ]
    return np.array(integrated_values)

# Per-recording pipeline: energy column -> gravity.z peaks -> integrals
def process_datasets(datasets):
    """
    Compute, for every recording, the energy integrated between consecutive
    'gravity.z' peaks.

    Each DataFrame gets an 'energy' column added in place (via
    ``calculate_energy``). Peaks are detected with
    ``distance=10, prominence=0.15, width=1``.

    Args:
        datasets: dict of name -> DataFrame.

    Returns:
        dict of name -> numpy array of per-interval integrals.
    """
    results = {}
    for key, frame in datasets.items():
        frame = calculate_energy(frame)
        peak_positions, _ = find_peaks(frame['gravity.z'], distance=10, prominence=0.15, width=1)
        results[key] = integrate_energy_over_intervals(frame, peak_positions)
    return results

# Keep the values that lie closest to the mean
def select_closest_to_mean(data, percentage=0.8):
    """
    Return the fraction of ``data`` whose values are nearest to its mean.

    Args:
        data: numpy array of values.
        percentage: fraction to keep; the count is ``int(len(data) * percentage)``.

    Returns:
        numpy array of the kept values, ordered from nearest to farthest
        from the mean.
    """
    keep_count = int(len(data) * percentage)
    by_distance = np.argsort(np.abs(data - np.mean(data)))
    return data[by_distance[:keep_count]]

# Reduce each array to the mean of its values closest to the mean
def process_and_select_data(datasets):
    """
    Summarise every array by the mean of the values kept by
    ``select_closest_to_mean`` (default fraction).

    Args:
        datasets: dict of name -> numpy array.

    Returns:
        dict of name -> mean of the selected values.
    """
    return {
        key: np.mean(select_closest_to_mean(values))
        for key, values in datasets.items()
    }

# Mean spacing between gravity.z peaks, per recording
def calculate_peak_intervals(data):
    """
    Compute the mean distance (in rows) between consecutive 'gravity.z'
    peaks of every recording.

    Peaks are detected with ``distance=10, prominence=0.15, width=1``. The
    gaps between peaks are filtered with ``select_closest_to_mean`` (0.8)
    before averaging.

    Args:
        data: dict of name -> DataFrame with a 'gravity.z' column.

    Returns:
        dict of name -> mean interval, or None for recordings with fewer
        than two peaks.
    """
    mean_gaps = {}
    for key, frame in data.items():
        peak_positions, _ = find_peaks(frame['gravity.z'], distance=10, prominence=0.15, width=1)
        if len(peak_positions) <= 1:
            mean_gaps[key] = None
            continue
        gaps = select_closest_to_mean(np.diff(peak_positions), percentage=0.8)
        mean_gaps[key] = np.mean(gaps)
    return mean_gaps

In [None]:
# Add the 'energy' column to every walking recording. calculate_energy
# writes the column into the frame it receives and returns that same frame,
# so corrected_datasets refers to the same DataFrame objects as
# dataset_wlk_by_id_and_trial.
corrected_datasets = {}
for name, df in dataset_wlk_by_id_and_trial.items():
    corrected_df = calculate_energy(df)
    corrected_datasets[name] = corrected_df

# Plot gravity.z with its detected peaks for ids '0.0'..'23.0' and the three
# walking trials; combinations that are not in the dictionary only print a
# "not found" message.
for i in range(24):
    for j in ['7.0', '8.0', '15.0']:
        plot_line_peak(corrected_datasets, 'dataset_wlk', '{:.1f}'.format(i), j, 'gravity.z')

In [None]:
# Add the 'energy' column to every jogging recording. calculate_energy
# writes the column into the frame it receives and returns that same frame,
# so corrected_datasets refers to the same DataFrame objects as
# dataset_jog_by_id_and_trial.
corrected_datasets = {}
for name, df in dataset_jog_by_id_and_trial.items():
    corrected_df = calculate_energy(df)
    corrected_datasets[name] = corrected_df

# Plot gravity.z with its detected peaks for ids '0.0'..'23.0' and the two
# jogging trials; combinations that are not in the dictionary only print a
# "not found" message.
for i in range(24):
    for j in ['9.0', '16.0']:
        plot_line_peak(corrected_datasets, 'dataset_jog', '{:.1f}'.format(i), j, 'gravity.z')

In [None]:
# Mean gravity.z peak interval per recording, for walking and for jogging
peak_intervals_wlk = calculate_peak_intervals(dataset_wlk_by_id_and_trial)
peak_intervals_jog = calculate_peak_intervals(dataset_jog_by_id_and_trial)


# Keep only recordings for which an interval could be computed
# (calculate_peak_intervals returns None when fewer than two peaks are found)
wlk_intervals = [interval for interval in peak_intervals_wlk.values() if interval is not None]
jog_intervals = [interval for interval in peak_intervals_jog.values() if interval is not None]

# Draw the box plot comparing the two activities
plt.figure(figsize=(12, 6))
plt.boxplot([wlk_intervals, jog_intervals], labels=['Walking', 'Jogging'])
plt.xlabel('Activity')
plt.ylabel('Mean Peak Interval')
plt.title('Box Plot of Mean Peak Intervals for Walking and Jogging')
plt.show()

In [None]:
# Per-recording mean integrated energy for walking
processed_datasets_wlk = process_datasets(dataset_wlk_by_id_and_trial)
selected_data_wlk = process_and_select_data(processed_datasets_wlk)

# Per-recording mean integrated energy for jogging. This used to be computed
# from the walking dataset, so both boxes showed identical (walking) data.
processed_datasets_jog = process_datasets(dataset_jog_by_id_and_trial)
selected_data_jog = process_and_select_data(processed_datasets_jog)

# Collect the energy values of each activity
energy_wlk = list(selected_data_wlk.values())
energy_jog = list(selected_data_jog.values())

# Draw the box plot comparing the two activities
plt.figure(figsize=(12, 6))
plt.boxplot([energy_wlk, energy_jog], labels=['Walking', 'Jogging'])
plt.xlabel('Activity')
plt.ylabel('Integrated Energy')
plt.title('Boxplot of Integrated Energy by Activity')
plt.show()

### 모델 예측 실시

### wlk 기본

In [None]:
# Mean integrated energy per walking recording
processed_datasets = process_datasets(dataset_wlk_by_id_and_trial)
selected_data = process_and_select_data(processed_datasets)

# Prepare the energy (feature) and height (target) data
X = []
y = []
ids = []

for name, energy in selected_data.items():
    # Third underscore-separated field of the key, i.e. the id in names
    # built as '<prefix>_<id>_trial_<trial>' with prefix 'dataset_wlk'
    id_value = (name.split('_')[2])
    height = dataset_wlk_by_id_and_trial[name]['height'].iloc[0]  # read 'height' from the first row of the recording
    X.append([energy])
    y.append(height)
    ids.append(id_value)

X = np.array(X)
y = np.array(y)
ids = np.array(ids)

##### scatter 그래프 확인

In [None]:
# Draw the scatter plot of energy against height
plt.figure(figsize=(12, 6))
plt.scatter(X, y, c='blue', label='Data Points')

# Annotate every point with its id
for i, txt in enumerate(ids):
    plt.annotate(txt, (X[i], y[i]), textcoords="offset points", xytext=(5, 5), ha='center')

plt.xlabel('Mean Energy')
plt.ylabel('Height')
plt.title('Energy vs. Height')
plt.legend()
plt.show()

In [None]:
def polynomial_regression_analysis(X, y, ids, degree=1, test_size=0.2, num_iterations=1000):
    """
    Fit a polynomial regression on many random train/test splits, report the
    best-scoring split and plot its fitted curve.

    For every iteration a new split is drawn (seeded from ``np.random``), a
    ``PolynomialFeatures(degree)`` + ``LinearRegression`` pipeline is fitted
    on the training part and scored on the test part. Only the model with the
    highest test R^2 is kept.

    NOTE(review): the reported R^2 is the maximum over ``num_iterations``
    test splits, so it is an optimistic figure rather than an unbiased
    estimate.

    Args:
        X: 2-D array of features; column 0 is used as the x-axis of the plot.
        y: 1-D array of targets.
        ids: labels used to annotate the scattered points, parallel to ``X``.
        degree: polynomial degree.
        test_size: fraction of samples held out in each split.
        num_iterations: number of random splits to try.

    Raises:
        ValueError: if no split produced a comparable R^2 score (for example
            ``num_iterations`` is 0).
    """
    best_r2 = -np.inf
    best_mse = None
    best_model = None

    for _ in range(num_iterations):
        # Draw a fresh random split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=np.random.randint(10000))

        # Fit the polynomial regression
        model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
        model.fit(X_train, y_train)

        # Score on the held-out part
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)

        # Keep only the best model instead of storing every fit. A NaN score
        # compares False here and is therefore never selected.
        if r2 > best_r2:
            best_r2 = r2
            best_mse = mean_squared_error(y_test, y_pred)
            best_model = model

    if best_model is None:
        raise ValueError("No valid R^2 score was obtained; check num_iterations and test_size.")

    print(f'Best R^2 Score: {best_r2:.2f}')
    print(f'Corresponding Mean Squared Error: {best_mse:.2f}')

    # Scatter the original data
    plt.figure(figsize=(12, 6))
    plt.scatter(X[:, 0], y, c='blue', label='Data Points')

    # Annotate every point with its id
    for i, txt in enumerate(ids):
        plt.annotate(txt, (X[i, 0], y[i]), textcoords="offset points", xytext=(5, 5), ha='center')

    # Draw the best model's prediction curve over the observed x-range
    X_fit = np.linspace(X.min(), X.max(), 100)[:, np.newaxis]
    y_fit = best_model.predict(X_fit)
    plt.plot(X_fit, y_fit, color='red', label='Best Polynomial Fit')

    plt.xlabel('Mean Energy')
    plt.ylabel('Height')
    plt.title(f'Polynomial Regression model (degree = {degree})')
    plt.legend()
    plt.show()

### 선형 모델

##### 1차 선형 모델

In [None]:
# Degree-1 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=1)

##### 2차 선형 모델

In [None]:
# Degree-2 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=2)

##### 3차 선형 모델

In [None]:
# Degree-3 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=3)

### y축 양자화 적용

In [None]:
def quantize_height(height):
    """
    Map a height to the representative value of its 5-unit bin.

    Bins: below 165 -> 163, [165, 170) -> 168, [170, 175) -> 173,
    [175, 180) -> 178, [180, 185) -> 183, everything else -> 188.
    """
    # (exclusive upper bound, representative value), in ascending order
    bins = ((165, 163), (170, 168), (175, 173), (180, 178), (185, 183))
    for upper_bound, representative in bins:
        if height < upper_bound:
            return representative
    return 188

In [None]:
# Process the walking training data and reduce each recording to one energy value
processed_datasets = process_datasets(dataset_wlk_by_id_and_trial)
selected_data = process_and_select_data(processed_datasets)

# Prepare the energy (feature) and quantized height (target) data
X = []
y = []
ids = []

for name, energy in selected_data.items():
    # Third underscore-separated field of the key, i.e. the id in names
    # built as '<prefix>_<id>_trial_<trial>' with prefix 'dataset_wlk'
    id_value = (name.split('_')[2])
    height = dataset_wlk_by_id_and_trial[name]['height'].iloc[0]  # read 'height' from the first row of the recording
    quantized_height = quantize_height(height)
    X.append([energy])
    y.append(quantized_height)
    ids.append(id_value)

X = np.array(X)
y = np.array(y)
ids = np.array(ids)

##### 1차 선형 모델

In [None]:
# Degree-1 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=1)

##### 2차 선형 모델

In [None]:
# Degree-2 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=2)

##### 3차 선형 모델

In [None]:
# Degree-3 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=3)

### 조깅 확인

In [None]:
# Mean integrated energy per jogging recording
processed_datasets = process_datasets(dataset_jog_by_id_and_trial)
selected_data = process_and_select_data(processed_datasets)

# Prepare the energy (feature) and height (target) data
X = []
y = []
ids = []

for name, energy in selected_data.items():
    # Third underscore-separated field of the key, i.e. the id in names
    # built as '<prefix>_<id>_trial_<trial>' with prefix 'dataset_jog'
    id_value = (name.split('_')[2])
    height = dataset_jog_by_id_and_trial[name]['height'].iloc[0]  # read 'height' from the first row of the recording
    X.append([energy])
    y.append(height)
    ids.append(id_value)

X = np.array(X)
y = np.array(y)
ids = np.array(ids)

##### scatter 확인

In [None]:
# Draw the scatter plot of energy against height
plt.figure(figsize=(12, 6))
plt.scatter(X, y, c='blue', label='Data Points')

# Annotate every point with its id
for i, txt in enumerate(ids):
    plt.annotate(txt, (X[i], y[i]), textcoords="offset points", xytext=(5, 5), ha='center')

plt.xlabel('Mean Energy')
plt.ylabel('Height')
plt.title('Scatter Plot of Energy vs. Height with IDs')
plt.legend()
plt.show()

##### 1차 선형 모델

In [None]:
# Degree-1 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=1)

##### 2차 선형 모델

In [None]:
# Degree-2 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=2)

##### 3차 선형 모델

In [None]:
# Degree-3 fit on the current X, y and ids
polynomial_regression_analysis(X, y, ids, degree=3)

### jog, wlk 결합

##### logistic regression

In [None]:
peak_intervals_wlk = calculate_peak_intervals(dataset_wlk_by_id_and_trial)
peak_intervals_jog = calculate_peak_intervals(dataset_jog_by_id_and_trial)

# Feature: mean peak interval per recording. Label: 0 = walking, 1 = jogging.
# NOTE(review): calculate_peak_intervals yields None for recordings with
# fewer than two peaks; such an entry would make the array non-numeric and
# the scaler fail. The entries are kept (not filtered) because the following
# cells index the energy arrays with labels derived from this array.
X_intervals = np.array(list(peak_intervals_wlk.values()) + list(peak_intervals_jog.values())).reshape(-1, 1)
y_behaviors = np.array([0]*len(peak_intervals_wlk) + [1]*len(peak_intervals_jog))

# Standardise the feature
scaler = StandardScaler()
X_intervals_scaled = scaler.fit_transform(X_intervals)

# Train the logistic regression classifier
logistic_model = LogisticRegression()
logistic_model.fit(X_intervals_scaled, y_behaviors)

# Predicted behaviour for the training recordings themselves
behavior_labels = logistic_model.predict(X_intervals_scaled)

# Visualise the classification: points are grouped (and coloured) by the
# predicted label and placed at their true label on the y-axis.
plt.figure(figsize=(12, 6))
colors = ['blue', 'green']
for behavior in np.unique(behavior_labels):
    behavior_indices = np.where(behavior_labels == behavior)
    X_behavior = X_intervals[behavior_indices]
    y_behavior = y_behaviors[behavior_indices]
    # Index by the label itself (0 -> blue, 1 -> green). The previous
    # ``behavior-1`` relied on negative list indexing, gave the opposite
    # colours to the validation plot, and failed once ``colors`` had been
    # rebound to a dict there.
    plt.scatter(X_behavior[:, 0], y_behavior, c=colors[behavior], label=f'Behavior {behavior}')

# Draw the fitted probability curve of class 1, mapped back to the original
# interval scale
X_fit = np.linspace(X_intervals_scaled.min(), X_intervals_scaled.max(), 300)
y_prob = logistic_model.predict_proba(X_fit.reshape(-1, 1))[:, 1]  # probability of class 1
plt.plot(scaler.inverse_transform(X_fit.reshape(-1, 1)), y_prob, color='red', linewidth=2, label='Logistic Regression Fit')

plt.xlabel('Mean Interval')
plt.ylabel('Behavior')
# Labels follow y_behaviors above: 0 is walking, 1 is jogging (the title
# previously stated the reverse).
plt.title('Logistic Regression (Behavior 0 = wlk, Behavior 1 = jog)')
plt.legend()
plt.show()

##### 1차 선형 학습 모델 (1차)

In [None]:
processed_datasets_wlk = process_datasets(dataset_wlk_by_id_and_trial)
processed_datasets_jog = process_datasets(dataset_jog_by_id_and_trial)
selected_data_wlk = process_and_select_data(processed_datasets_wlk)
selected_data_jog = process_and_select_data(processed_datasets_jog)

# Prepare the energy (feature) and height (target) data for the linear
# regression: walking recordings first, then jogging, i.e. the same order in
# which behavior_labels was built.
X_energy = []
y_energy = []
ids = []

for name, energy in selected_data_wlk.items():
    id_value = name.split('_')[2]
    height = dataset_wlk_by_id_and_trial[name]['height'].iloc[0]
    X_energy.append(energy)
    y_energy.append(height)
    ids.append(id_value)

for name, energy in selected_data_jog.items():
    id_value = name.split('_')[2]
    height = dataset_jog_by_id_and_trial[name]['height'].iloc[0]
    X_energy.append(energy)
    y_energy.append(height)
    ids.append(id_value)

X_energy = np.array(X_energy).reshape(-1, 1)
y_energy = np.array(y_energy)
ids = np.array(ids)

# Train one linear regression per predicted behaviour group
num_iterations = 1000
best_models = []

for behavior in np.unique(behavior_labels):
    behavior_indices = np.where(behavior_labels == behavior)
    X_behavior = X_energy[behavior_indices]
    y_behavior = y_energy[behavior_indices]

    best_r2_score = -np.inf
    best_model = None

    for _ in range(num_iterations):
        # Draw a fresh random split
        X_train, X_test, y_train, y_test = train_test_split(X_behavior, y_behavior, test_size=0.2, random_state=np.random.randint(10000))

        # Fit the linear regression
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Score on the held-out part
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)

        if r2 > best_r2_score:
            best_r2_score = r2
            best_model = model

    # Models are stored in the order of np.unique(behavior_labels)
    best_models.append(best_model)
    print(f'Best R^2 Score for Behavior {behavior}: {best_r2_score:.2f}')

    # Print the fitted line
    coef = best_model.coef_[0]
    intercept = best_model.intercept_
    print(f'Linear Model for Behavior {behavior}: y = {coef:.4f} * x + {intercept:.4f}')

    # Scatter the group's data. The colour is looked up by the label itself
    # (0 -> first colour, 1 -> second), matching the validation plot; the old
    # ``behavior-1`` swapped the colours and failed when ``colors`` is a dict.
    plt.figure(figsize=(12, 6))
    plt.scatter(X_behavior, y_behavior, c=colors[behavior], label=f'Behavior {behavior} Data')

    # Draw the best model's prediction line
    X_fit = np.linspace(X_behavior.min(), X_behavior.max(), 100).reshape(-1, 1)
    y_fit = best_model.predict(X_fit)
    plt.plot(X_fit, y_fit, color='red', label='Best Linear Fit')

    plt.xlabel('Energy')
    plt.ylabel('Height')
    plt.title(f'Linear Regression (Behavior {behavior})')
    plt.legend()
    plt.show()

##### 1차 선형 모델 (2차)

In [None]:
def remove_outliers(X, y, model, threshold_multiplier=3.5):
    """
    Drop the samples whose absolute residual under ``model`` is too large.

    A sample is kept when ``|y - model.predict(X)|`` is at most
    ``threshold_multiplier`` times the standard deviation of those absolute
    residuals.

    Args:
        X: feature array accepted by ``model.predict``.
        y: target array, parallel to ``X``.
        model: fitted object exposing ``predict``.
        threshold_multiplier: width of the acceptance band in standard
            deviations.

    Returns:
        ``(X_kept, y_kept)`` with the outlying samples removed.
    """
    abs_errors = np.abs(y - model.predict(X))
    keep_mask = abs_errors <= threshold_multiplier * np.std(abs_errors)
    return X[keep_mask], y[keep_mask]

# Remove outliers and fit a second-pass model per behaviour group
for idx, behavior in enumerate(np.unique(behavior_labels)):
    behavior_indices = np.where(behavior_labels == behavior)
    X_behavior = X_energy[behavior_indices]
    y_behavior = y_energy[behavior_indices]

    # Use the best model trained in the previous cell to identify outliers
    best_model = best_models[idx]
    X_cleaned, y_cleaned = remove_outliers(X_behavior, y_behavior, best_model)

    best_r2_score_cleaned = -np.inf
    best_model_cleaned = None

    for _ in range(num_iterations):
        # Draw a fresh random split
        X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=np.random.randint(10000))

        # Fit the linear regression
        model_cleaned = LinearRegression()
        model_cleaned.fit(X_train, y_train)

        # Score on the held-out part
        y_pred_cleaned = model_cleaned.predict(X_test)
        r2_cleaned = r2_score(y_test, y_pred_cleaned)

        if r2_cleaned > best_r2_score_cleaned:
            best_r2_score_cleaned = r2_cleaned
            best_model_cleaned = model_cleaned

    print(f'Best R^2 Score for Behavior {behavior} after removing outliers: {best_r2_score_cleaned:.2f}')

    # Print the fitted line
    coef = best_model_cleaned.coef_[0]
    intercept = best_model_cleaned.intercept_
    print(f'Linear Model for Behavior {behavior} after removing outliers: y = {coef:.4f} * x + {intercept:.4f}')

    # Scatter the cleaned data. The colour is looked up by the label itself,
    # matching the validation plot; the old ``behavior-1`` swapped the
    # colours and failed when ``colors`` is a dict.
    plt.figure(figsize=(12, 6))
    plt.scatter(X_cleaned, y_cleaned, color=colors[behavior], label=f'Behavior {behavior} Data after removing outliers')

    # Draw the best model's prediction line
    X_fit = np.linspace(X_cleaned.min(), X_cleaned.max(), 100).reshape(-1, 1)
    y_fit = best_model_cleaned.predict(X_fit)
    plt.plot(X_fit, y_fit, color='red', label='Best Linear Fit After Removing Outliers')

    # The energy values are used as computed (no scaler is applied in this
    # cell), so the axis is labelled plainly.
    plt.xlabel('Energy')
    plt.ylabel('Height')
    plt.title(f'Linear Regression After Removing Outliers (Behavior {behavior})')
    plt.legend()
    plt.show()

### 모델 검증

In [None]:
# Validate on the held-out data (dataset_ver_wlk_by_id_and_trial and dataset_ver_jog_by_id_and_trial)
new_processed_datasets_wlk = process_datasets(dataset_ver_wlk_by_id_and_trial)
new_processed_datasets_jog = process_datasets(dataset_ver_jog_by_id_and_trial)
new_selected_data_wlk = process_and_select_data(new_processed_datasets_wlk)
new_selected_data_jog = process_and_select_data(new_processed_datasets_jog)


# Prepare the held-out energy/height data: walking recordings first, then jogging
X_new = []
y_new = []
ids_new = []

for name, energy in new_selected_data_wlk.items():
    id_value = name.split('_')[2]
    height = dataset_ver_wlk_by_id_and_trial[name]['height'].iloc[0]
    X_new.append(energy)
    y_new.append(height)
    ids_new.append(id_value)

for name, energy in new_selected_data_jog.items():
    id_value = name.split('_')[2]
    height = dataset_ver_jog_by_id_and_trial[name]['height'].iloc[0]
    X_new.append(energy)
    y_new.append(height)
    ids_new.append(id_value)

X_new = np.array(X_new).reshape(-1, 1)
y_new = np.array(y_new)
ids_new = np.array(ids_new)

# Predict the behaviour of each held-out recording from its mean peak
# interval, using the scaler and logistic model fitted earlier. The merged
# dict lists the walking recordings before the jogging ones.
new_peak_intervals = calculate_peak_intervals({**dataset_ver_wlk_by_id_and_trial, **dataset_ver_jog_by_id_and_trial})
X_new_intervals = np.array(list(new_peak_intervals.values())).reshape(-1, 1)
new_behaviors = logistic_model.predict(scaler.transform(X_new_intervals))

# Apply the per-behaviour linear model to the held-out data and plot the result.
# Note: this rebinds the module-level name `colors` from a list to a dict.
colors = {0: 'blue', 1: 'green'}
labels = {0: 'Walking', 1: 'Jogging'}

plt.figure(figsize=(12, 6))
X_range = np.linspace(10, 80, 100).reshape(-1, 1)

for behavior in np.unique(new_behaviors):
    behavior_indices = np.where(new_behaviors == behavior)
    X_behavior = X_new[behavior_indices]
    y_behavior = y_new[behavior_indices]
    # best_models is indexed by the predicted label value here
    best_model = best_models[behavior]

    plt.scatter(X_behavior, y_behavior, color=colors[behavior], label=f'{labels[behavior]} Data')

    # Draw the model's prediction line over the fixed x-range
    y_fit = best_model.predict(X_range)
    plt.plot(X_range, y_fit, color=colors[behavior], label=f'{labels[behavior]} Model')

plt.xlabel('Energy')
plt.ylabel('Height')
plt.xlim(10, 80)
plt.ylim(150, 190)
plt.title('RESULT')
plt.legend()
plt.show()

# Print the prediction for every held-out data point
for i, (energy, true_height, behavior, id_value) in enumerate(zip(X_new, y_new, new_behaviors, ids_new)):
    best_model = best_models[behavior]
    pred_height = best_model.predict(energy.reshape(1, -1))[0]

    behavior_label = "Jogging" if behavior == 1 else "Walking"
    print(f"ID: {id_value}  |  Behavior: {behavior_label}  |  True Height: {true_height}  |  Predicted Height: {pred_height:.2f}")

##### 추가적인 검증

In [None]:
def _fit_best_split_model(X, y, num_iterations=1000, test_size=0.1):
    """Fit a LinearRegression on repeated random splits and keep the best one.

    Every iteration draws a new train/test split (its seed comes from
    ``np.random.randint``), fits on the training part and scores R^2 on the
    test part.

    Returns:
        (model, score): the fitted model with the highest test R^2 and that
        score. ``model`` stays None if no split scored above ``-inf``.
    """
    # NOTE(review): the winner is chosen by its score on its own test split,
    # so the reported R^2 is an optimistic estimate.
    best_score = -np.inf
    best_model = None

    for _ in range(num_iterations):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=np.random.randint(10000))

        model = LinearRegression()
        model.fit(X_train, y_train)
        score = r2_score(y_test, model.predict(X_test))

        if score > best_score:
            best_score = score
            best_model = model

    return best_model, best_score


def _remove_outliers(X, y, model, threshold_multiplier=3.5):
    """Drop samples whose absolute residual under *model* exceeds
    ``threshold_multiplier`` times the standard deviation of those residuals."""
    residuals = np.abs(y - model.predict(X))
    keep = residuals <= threshold_multiplier * np.std(residuals)
    return X[keep], y[keep]


def _energy_height_rows(selected_data, datasets_by_name):
    """Return parallel lists (energies, heights, ids) for each selected trial.

    The id is the third '_'-separated token of the trial name; the height is
    read from the first row of the trial's 'height' column.
    """
    energies, heights, subject_ids = [], [], []
    for name, energy in selected_data.items():
        subject_ids.append(name.split('_')[2])
        heights.append(datasets_by_name[name]['height'].iloc[0])
        energies.append(energy)
    return energies, heights, subject_ids


def result(dataset_wlk_by_id_and_trial, dataset_jog_by_id_and_trial, dataset_ver_wlk_by_id_and_trial, dataset_ver_jog_by_id_and_trial, a, b):
    """Train on the given trials, validate on the held-out ones, plot and print.

    Steps:
      1. Fit a logistic regression on scaled peak intervals
         (label 0 = walking trials, 1 = jogging trials).
      2. For each predicted behavior group, fit a height ~ energy linear model
         (best of 1000 random splits).
      3. Remove outliers with that model, refit, print the refit R^2 and line,
         and append the refit slope to ``a`` (label 0) or ``b`` (other labels).
      4. Classify the held-out trials, plot them with the step-2 model lines
         and print one prediction line per held-out trial.

    Args:
        dataset_wlk_by_id_and_trial: mapping trial name -> DataFrame (training, walking).
        dataset_jog_by_id_and_trial: mapping trial name -> DataFrame (training, jogging).
        dataset_ver_wlk_by_id_and_trial: same, held-out walking trials.
        dataset_ver_jog_by_id_and_trial: same, held-out jogging trials.
        a: list mutated in place; receives the refit slope for label 0.
        b: list mutated in place; receives the refit slope for any other label.

    Returns:
        None. Output is the printed text and the matplotlib figure.
    """
    peak_intervals_wlk = calculate_peak_intervals(dataset_wlk_by_id_and_trial)
    peak_intervals_jog = calculate_peak_intervals(dataset_jog_by_id_and_trial)

    # Peak intervals (feature) and behavior labels (target) for the classifier.
    X_intervals = np.array(list(peak_intervals_wlk.values()) + list(peak_intervals_jog.values())).reshape(-1, 1)
    y_behaviors = np.array([0] * len(peak_intervals_wlk) + [1] * len(peak_intervals_jog))

    scaler = StandardScaler()
    X_intervals_scaled = scaler.fit_transform(X_intervals)

    logistic_model = LogisticRegression()
    logistic_model.fit(X_intervals_scaled, y_behaviors)

    # The regression groups are formed from the classifier's own predictions
    # on the training data, not from the true labels.
    behavior_labels = logistic_model.predict(X_intervals_scaled)

    processed_datasets_wlk = process_datasets(dataset_wlk_by_id_and_trial)
    processed_datasets_jog = process_datasets(dataset_jog_by_id_and_trial)
    selected_data_wlk = process_and_select_data(processed_datasets_wlk)
    selected_data_jog = process_and_select_data(processed_datasets_jog)

    # Energy (feature) and height (target) for the linear regressions.
    # NOTE(review): rows are assumed to line up one-to-one, in order, with
    # behavior_labels -- confirm process_and_select_data keeps every trial.
    energies_wlk, heights_wlk, _ = _energy_height_rows(selected_data_wlk, dataset_wlk_by_id_and_trial)
    energies_jog, heights_jog, _ = _energy_height_rows(selected_data_jog, dataset_jog_by_id_and_trial)
    X_energy = np.array(energies_wlk + energies_jog).reshape(-1, 1)
    y_energy = np.array(heights_wlk + heights_jog)

    num_iterations = 1000
    trained_behaviors = np.unique(behavior_labels)

    # First pass: one model per behavior, keyed by the label itself so later
    # lookups stay correct even if only one label was predicted.
    best_models = {}
    for behavior in trained_behaviors:
        behavior_indices = np.where(behavior_labels == behavior)
        model, _ = _fit_best_split_model(
            X_energy[behavior_indices], y_energy[behavior_indices], num_iterations)
        best_models[int(behavior)] = model

    # Second pass: remove outliers with the first-pass model and refit.
    for behavior in trained_behaviors:
        behavior_indices = np.where(behavior_labels == behavior)
        X_cleaned, y_cleaned = _remove_outliers(
            X_energy[behavior_indices], y_energy[behavior_indices], best_models[int(behavior)])

        best_model_cleaned, best_r2_score_cleaned = _fit_best_split_model(
            X_cleaned, y_cleaned, num_iterations)

        print(f'Best R^2 Score for Behavior {behavior} after removing outliers: {best_r2_score_cleaned:.2f}')

        coef = best_model_cleaned.coef_[0]
        intercept = best_model_cleaned.intercept_

        # Route the slope by label, not by loop position.
        if behavior == 0:
            a.append(coef)
        else:
            b.append(coef)
        print(f'Linear Model for Behavior {behavior} after removing outliers: y = {coef:.4f} * x + {intercept:.4f}')

    # Validation on the held-out trials.
    new_processed_datasets_wlk = process_datasets(dataset_ver_wlk_by_id_and_trial)
    new_processed_datasets_jog = process_datasets(dataset_ver_jog_by_id_and_trial)
    new_selected_data_wlk = process_and_select_data(new_processed_datasets_wlk)
    new_selected_data_jog = process_and_select_data(new_processed_datasets_jog)

    new_energies_wlk, new_heights_wlk, new_ids_wlk = _energy_height_rows(
        new_selected_data_wlk, dataset_ver_wlk_by_id_and_trial)
    new_energies_jog, new_heights_jog, new_ids_jog = _energy_height_rows(
        new_selected_data_jog, dataset_ver_jog_by_id_and_trial)

    X_new = np.array(new_energies_wlk + new_energies_jog).reshape(-1, 1)
    y_new = np.array(new_heights_wlk + new_heights_jog)
    ids_new = np.array(new_ids_wlk + new_ids_jog)

    # Classify the held-out trials from their peak intervals.
    new_peak_intervals = calculate_peak_intervals({**dataset_ver_wlk_by_id_and_trial, **dataset_ver_jog_by_id_and_trial})
    X_new_intervals = np.array(list(new_peak_intervals.values())).reshape(-1, 1)
    new_behaviors = logistic_model.predict(scaler.transform(X_new_intervals))

    colors = {0: 'blue', 1: 'green'}
    labels = {0: 'Walking', 1: 'Jogging'}

    plt.figure(figsize=(12, 6))
    X_range = np.linspace(10, 80, 100).reshape(-1, 1)

    # NOTE(review): predictions below use the first-pass models, not the
    # outlier-cleaned refits -- confirm this is intended.
    for behavior in np.unique(new_behaviors):
        behavior_indices = np.where(new_behaviors == behavior)
        X_behavior = X_new[behavior_indices]
        y_behavior = y_new[behavior_indices]

        plt.scatter(X_behavior, y_behavior, color=colors[behavior], label=f'{labels[behavior]} Data')

        # Draw the fitted line only when a model exists for this label.
        best_model = best_models.get(int(behavior))
        if best_model is not None:
            y_fit = best_model.predict(X_range)
            plt.plot(X_range, y_fit, color=colors[behavior], label=f'{labels[behavior]} Model')

    plt.xlabel('Energy')
    plt.ylabel('Height')
    plt.xlim(10, 80)
    plt.ylim(150, 190)
    plt.title('RESULT')
    plt.legend()
    plt.show()

    # One line per held-out trial.
    for energy, true_height, behavior, id_value in zip(X_new, y_new, new_behaviors, ids_new):
        behavior_label = "Jogging" if behavior == 1 else "Walking"
        best_model = best_models.get(int(behavior))
        if best_model is None:
            # No regression was trained for this predicted label.
            print(f"ID: {id_value}  |  Behavior: {behavior_label}  |  True Height: {true_height}  |  Predicted Height: N/A")
            continue

        pred_height = best_model.predict(energy.reshape(1, -1))[0]
        print(f"ID: {id_value}  |  Behavior: {behavior_label}  |  True Height: {true_height}  |  Predicted Height: {pred_height:.2f}")


def creat_time_series_seq(dt_list, act_labels, trial_codes, id, mode="mag", labeled=True):
    """Load every subject's trials and split off one subject as a held-out set.

    Args:
        dt_list: list of column-name triples, one per sensor type.
        act_labels: list of activity names used in the folder names.
        trial_codes: list (parallel to ``act_labels``) of trial-code lists.
        id: zero-based subject id (the "id" column stores ``code - 1``) whose
            rows go into the held-out frame.
        mode: "mag" stores one magnitude column per sensor type; any other
            value stores the three raw axes per sensor type.
        labeled: append the columns
            [act, id, weight, height, age, gender, trial]. Must be True here:
            the final split reads the "id" column and raises KeyError without it.

    Returns:
        (dataset, dataset_ver): DataFrames with all rows whose id differs
        from ``id`` and all rows whose id equals ``id``, respectively.
    """
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list) * 3
    # "7" --> [act, code, weight, height, age, gender, trial]
    num_label_cols = 7 if labeled else 0

    ds_list = get_ds_infos()

    # Collect per-file arrays and concatenate once at the end; the leading
    # empty array fixes the shape and dtype when no file is read.
    chunks = [np.zeros((0, num_data_cols + num_label_cols))]

    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = 'A_DeviceMotion_data/' + act + '_' + str(trial) + '/sub_' + str(int(sub_id)) + '.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)

                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:, x_id] = (raw_data[axes] ** 2).sum(axis=1) ** 0.5
                    else:
                        vals[:, x_id * 3:(x_id + 1) * 3] = raw_data[axes].values

                if labeled:
                    label_row = np.array([[act_id,
                                           sub_id - 1,
                                           ds_list["weight"][sub_id - 1],
                                           ds_list["height"][sub_id - 1],
                                           ds_list["age"][sub_id - 1],
                                           ds_list["gender"][sub_id - 1],
                                           trial
                                           ]])
                    # np.repeat keeps the label block 2-D even for an empty file.
                    lbls = np.repeat(label_row, len(raw_data), axis=0)
                    vals = np.concatenate((vals, lbls), axis=1)

                chunks.append(vals)

    dataset = np.concatenate(chunks, axis=0)

    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            # Strip the axis suffix after the last dot, so "attitude.roll"
            # yields "attitude" and "userAcceleration.x" yields "userAcceleration".
            cols += [str(axes[0]).rsplit(".", 1)[0]]

    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]

    dataset = pd.DataFrame(data=dataset, columns=cols)

    dataset_ver = dataset[(dataset['id'] == id)]
    dataset = dataset[(dataset['id'] != id)]

    return dataset, dataset_ver

# Leave-one-subject-out run: for every zero-based subject id 0..23, hold that
# subject out, train on the rest and validate on the held-out trials.
# result() appends one fitted slope per behavior group to `a` / `b`.
a = []
b = []
for seq_id in range(24):
    # First activity slice; the variable names suggest walking
    # (presumably ACT_LABELS[2] is the walking label -- verify).
    act_labels = ACT_LABELS [2:3]
    trial_codes = [TRIAL_CODES[act] for act in act_labels]
    dt_list = set_data_types(sdt)
    dataset_wlk, dataset_ver_wlk = creat_time_series_seq(dt_list, act_labels, trial_codes, seq_id, mode="raw", labeled=True)

    # Second activity slice; the variable names suggest jogging
    # (presumably ACT_LABELS[3] is the jogging label -- verify).
    act_labels = ACT_LABELS [3:4]
    trial_codes = [TRIAL_CODES[act] for act in act_labels]
    dt_list = set_data_types(sdt)
    dataset_jog, dataset_ver_jog = creat_time_series_seq(dt_list, act_labels, trial_codes, seq_id, mode="raw", labeled=True)

    # Split the training and held-out frames by subject id, then by trial.
    dataset_wlk_by_id = split_dataset_by_id(dataset_wlk, 'dataset_wlk')
    dataset_wlk_by_id_and_trial = split_by_trial(dataset_wlk_by_id)

    dataset_jog_by_id = split_dataset_by_id(dataset_jog, 'dataset_jog')
    dataset_jog_by_id_and_trial = split_by_trial(dataset_jog_by_id)

    dataset_ver_wlk_by_id = split_dataset_by_id(dataset_ver_wlk, 'dataset_wlk')
    dataset_ver_wlk_by_id_and_trial = split_by_trial(dataset_ver_wlk_by_id)

    dataset_ver_jog_by_id = split_dataset_by_id(dataset_ver_jog, 'dataset_jog')
    dataset_ver_jog_by_id_and_trial = split_by_trial(dataset_ver_jog_by_id)

    result(dataset_wlk_by_id_and_trial, dataset_jog_by_id_and_trial, dataset_ver_wlk_by_id_and_trial, dataset_ver_jog_by_id_and_trial, a, b)
    print('\n')



In [None]:
# Histogram (20 bins) of the slopes that result() appended to `a`,
# followed by their mean and standard deviation.
plt.hist(a, bins=20, label='a')
print(np.mean(a))
print(np.std(a))

In [None]:
# Histogram (20 bins) of the slopes that result() appended to `b`,
# followed by their mean and standard deviation.
plt.hist(b, bins=20, label='b')
print(np.mean(b))
print(np.std(b))

##### 23번 데이터셋 제거

In [None]:
def _fit_best_split_model(X, y, num_iterations=1000, test_size=0.1):
    """Fit a LinearRegression on repeated random splits and keep the best one.

    Every iteration draws a new train/test split (its seed comes from
    ``np.random.randint``), fits on the training part and scores R^2 on the
    test part.

    Returns:
        (model, score): the fitted model with the highest test R^2 and that
        score. ``model`` stays None if no split scored above ``-inf``.
    """
    # NOTE(review): the winner is chosen by its score on its own test split,
    # so the reported R^2 is an optimistic estimate.
    best_score = -np.inf
    best_model = None

    for _ in range(num_iterations):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=np.random.randint(10000))

        model = LinearRegression()
        model.fit(X_train, y_train)
        score = r2_score(y_test, model.predict(X_test))

        if score > best_score:
            best_score = score
            best_model = model

    return best_model, best_score


def _remove_outliers(X, y, model, threshold_multiplier=3.5):
    """Drop samples whose absolute residual under *model* exceeds
    ``threshold_multiplier`` times the standard deviation of those residuals."""
    residuals = np.abs(y - model.predict(X))
    keep = residuals <= threshold_multiplier * np.std(residuals)
    return X[keep], y[keep]


def _energy_height_rows(selected_data, datasets_by_name):
    """Return parallel lists (energies, heights, ids) for each selected trial.

    The id is the third '_'-separated token of the trial name; the height is
    read from the first row of the trial's 'height' column.
    """
    energies, heights, subject_ids = [], [], []
    for name, energy in selected_data.items():
        subject_ids.append(name.split('_')[2])
        heights.append(datasets_by_name[name]['height'].iloc[0])
        energies.append(energy)
    return energies, heights, subject_ids


def result(dataset_wlk_by_id_and_trial, dataset_jog_by_id_and_trial, dataset_ver_wlk_by_id_and_trial, dataset_ver_jog_by_id_and_trial, a, b):
    """Train on the given trials, validate on the held-out ones, plot and print.

    Steps:
      1. Fit a logistic regression on scaled peak intervals
         (label 0 = walking trials, 1 = jogging trials).
      2. For each predicted behavior group, fit a height ~ energy linear model
         (best of 1000 random splits).
      3. Remove outliers with that model, refit, print the refit R^2 and line,
         and append the refit slope to ``a`` (label 0) or ``b`` (other labels).
      4. Classify the held-out trials, plot them with the step-2 model lines
         and print one prediction line per held-out trial.

    Args:
        dataset_wlk_by_id_and_trial: mapping trial name -> DataFrame (training, walking).
        dataset_jog_by_id_and_trial: mapping trial name -> DataFrame (training, jogging).
        dataset_ver_wlk_by_id_and_trial: same, held-out walking trials.
        dataset_ver_jog_by_id_and_trial: same, held-out jogging trials.
        a: list mutated in place; receives the refit slope for label 0.
        b: list mutated in place; receives the refit slope for any other label.

    Returns:
        None. Output is the printed text and the matplotlib figure.
    """
    peak_intervals_wlk = calculate_peak_intervals(dataset_wlk_by_id_and_trial)
    peak_intervals_jog = calculate_peak_intervals(dataset_jog_by_id_and_trial)

    # Peak intervals (feature) and behavior labels (target) for the classifier.
    X_intervals = np.array(list(peak_intervals_wlk.values()) + list(peak_intervals_jog.values())).reshape(-1, 1)
    y_behaviors = np.array([0] * len(peak_intervals_wlk) + [1] * len(peak_intervals_jog))

    scaler = StandardScaler()
    X_intervals_scaled = scaler.fit_transform(X_intervals)

    logistic_model = LogisticRegression()
    logistic_model.fit(X_intervals_scaled, y_behaviors)

    # The regression groups are formed from the classifier's own predictions
    # on the training data, not from the true labels.
    behavior_labels = logistic_model.predict(X_intervals_scaled)

    processed_datasets_wlk = process_datasets(dataset_wlk_by_id_and_trial)
    processed_datasets_jog = process_datasets(dataset_jog_by_id_and_trial)
    selected_data_wlk = process_and_select_data(processed_datasets_wlk)
    selected_data_jog = process_and_select_data(processed_datasets_jog)

    # Energy (feature) and height (target) for the linear regressions.
    # NOTE(review): rows are assumed to line up one-to-one, in order, with
    # behavior_labels -- confirm process_and_select_data keeps every trial.
    energies_wlk, heights_wlk, _ = _energy_height_rows(selected_data_wlk, dataset_wlk_by_id_and_trial)
    energies_jog, heights_jog, _ = _energy_height_rows(selected_data_jog, dataset_jog_by_id_and_trial)
    X_energy = np.array(energies_wlk + energies_jog).reshape(-1, 1)
    y_energy = np.array(heights_wlk + heights_jog)

    num_iterations = 1000
    trained_behaviors = np.unique(behavior_labels)

    # First pass: one model per behavior, keyed by the label itself so later
    # lookups stay correct even if only one label was predicted.
    best_models = {}
    for behavior in trained_behaviors:
        behavior_indices = np.where(behavior_labels == behavior)
        model, _ = _fit_best_split_model(
            X_energy[behavior_indices], y_energy[behavior_indices], num_iterations)
        best_models[int(behavior)] = model

    # Second pass: remove outliers with the first-pass model and refit.
    for behavior in trained_behaviors:
        behavior_indices = np.where(behavior_labels == behavior)
        X_cleaned, y_cleaned = _remove_outliers(
            X_energy[behavior_indices], y_energy[behavior_indices], best_models[int(behavior)])

        best_model_cleaned, best_r2_score_cleaned = _fit_best_split_model(
            X_cleaned, y_cleaned, num_iterations)

        print(f'Best R^2 Score for Behavior {behavior} after removing outliers: {best_r2_score_cleaned:.2f}')

        coef = best_model_cleaned.coef_[0]
        intercept = best_model_cleaned.intercept_

        # Route the slope by label, not by loop position.
        if behavior == 0:
            a.append(coef)
        else:
            b.append(coef)
        print(f'Linear Model for Behavior {behavior} after removing outliers: y = {coef:.4f} * x + {intercept:.4f}')

    # Validation on the held-out trials.
    new_processed_datasets_wlk = process_datasets(dataset_ver_wlk_by_id_and_trial)
    new_processed_datasets_jog = process_datasets(dataset_ver_jog_by_id_and_trial)
    new_selected_data_wlk = process_and_select_data(new_processed_datasets_wlk)
    new_selected_data_jog = process_and_select_data(new_processed_datasets_jog)

    new_energies_wlk, new_heights_wlk, new_ids_wlk = _energy_height_rows(
        new_selected_data_wlk, dataset_ver_wlk_by_id_and_trial)
    new_energies_jog, new_heights_jog, new_ids_jog = _energy_height_rows(
        new_selected_data_jog, dataset_ver_jog_by_id_and_trial)

    X_new = np.array(new_energies_wlk + new_energies_jog).reshape(-1, 1)
    y_new = np.array(new_heights_wlk + new_heights_jog)
    ids_new = np.array(new_ids_wlk + new_ids_jog)

    # Classify the held-out trials from their peak intervals.
    new_peak_intervals = calculate_peak_intervals({**dataset_ver_wlk_by_id_and_trial, **dataset_ver_jog_by_id_and_trial})
    X_new_intervals = np.array(list(new_peak_intervals.values())).reshape(-1, 1)
    new_behaviors = logistic_model.predict(scaler.transform(X_new_intervals))

    colors = {0: 'blue', 1: 'green'}
    labels = {0: 'Walking', 1: 'Jogging'}

    plt.figure(figsize=(12, 6))
    X_range = np.linspace(10, 80, 100).reshape(-1, 1)

    # NOTE(review): predictions below use the first-pass models, not the
    # outlier-cleaned refits -- confirm this is intended.
    for behavior in np.unique(new_behaviors):
        behavior_indices = np.where(new_behaviors == behavior)
        X_behavior = X_new[behavior_indices]
        y_behavior = y_new[behavior_indices]

        plt.scatter(X_behavior, y_behavior, color=colors[behavior], label=f'{labels[behavior]} Data')

        # Draw the fitted line only when a model exists for this label.
        best_model = best_models.get(int(behavior))
        if best_model is not None:
            y_fit = best_model.predict(X_range)
            plt.plot(X_range, y_fit, color=colors[behavior], label=f'{labels[behavior]} Model')

    plt.xlabel('Energy')
    plt.ylabel('Height')
    plt.xlim(10, 80)
    plt.ylim(150, 190)
    plt.title('RESULT')
    plt.legend()
    plt.show()

    # One line per held-out trial.
    for energy, true_height, behavior, id_value in zip(X_new, y_new, new_behaviors, ids_new):
        behavior_label = "Jogging" if behavior == 1 else "Walking"
        best_model = best_models.get(int(behavior))
        if best_model is None:
            # No regression was trained for this predicted label.
            print(f"ID: {id_value}  |  Behavior: {behavior_label}  |  True Height: {true_height}  |  Predicted Height: N/A")
            continue

        pred_height = best_model.predict(energy.reshape(1, -1))[0]
        print(f"ID: {id_value}  |  Behavior: {behavior_label}  |  True Height: {true_height}  |  Predicted Height: {pred_height:.2f}")


def creat_time_series_seq(dt_list, act_labels, trial_codes, id, mode="mag", labeled=True, excluded_ids=(23,)):
    """Load every subject's trials, drop excluded subjects, and split off one
    subject as a held-out set.

    Args:
        dt_list: list of column-name triples, one per sensor type.
        act_labels: list of activity names used in the folder names.
        trial_codes: list (parallel to ``act_labels``) of trial-code lists.
        id: zero-based subject id (the "id" column stores ``code - 1``) whose
            rows go into the held-out frame.
        mode: "mag" stores one magnitude column per sensor type; any other
            value stores the three raw axes per sensor type.
        labeled: append the columns
            [act, id, weight, height, age, gender, trial]. Must be True here:
            the filtering reads the "id" column and raises KeyError without it.
        excluded_ids: zero-based subject ids removed from both returned
            frames. Defaults to ``(23,)``.

    Returns:
        (dataset, dataset_ver): after removing ``excluded_ids``, the rows
        whose id differs from ``id`` and the rows whose id equals ``id``.
        ``dataset_ver`` is empty when ``id`` is itself excluded.
    """
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list) * 3
    # "7" --> [act, code, weight, height, age, gender, trial]
    num_label_cols = 7 if labeled else 0

    ds_list = get_ds_infos()

    # Collect per-file arrays and concatenate once at the end; the leading
    # empty array fixes the shape and dtype when no file is read.
    chunks = [np.zeros((0, num_data_cols + num_label_cols))]

    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = 'A_DeviceMotion_data/' + act + '_' + str(trial) + '/sub_' + str(int(sub_id)) + '.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)

                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:, x_id] = (raw_data[axes] ** 2).sum(axis=1) ** 0.5
                    else:
                        vals[:, x_id * 3:(x_id + 1) * 3] = raw_data[axes].values

                if labeled:
                    label_row = np.array([[act_id,
                                           sub_id - 1,
                                           ds_list["weight"][sub_id - 1],
                                           ds_list["height"][sub_id - 1],
                                           ds_list["age"][sub_id - 1],
                                           ds_list["gender"][sub_id - 1],
                                           trial
                                           ]])
                    # np.repeat keeps the label block 2-D even for an empty file.
                    lbls = np.repeat(label_row, len(raw_data), axis=0)
                    vals = np.concatenate((vals, lbls), axis=1)

                chunks.append(vals)

    dataset = np.concatenate(chunks, axis=0)

    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            # Strip the axis suffix after the last dot, so "attitude.roll"
            # yields "attitude" and "userAcceleration.x" yields "userAcceleration".
            cols += [str(axes[0]).rsplit(".", 1)[0]]

    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]

    dataset = pd.DataFrame(data=dataset, columns=cols)

    # Remove the excluded subjects before splitting.
    dataset_filtered = dataset[~dataset['id'].isin(list(excluded_ids))]

    # Rows of the requested subject form the held-out set ...
    dataset_ver = dataset_filtered[dataset_filtered['id'] == id]

    # ... and everything else is the training set.
    dataset = dataset_filtered[dataset_filtered['id'] != id]

    return dataset, dataset_ver

# Leave-one-subject-out run over zero-based subject ids 0..22; id 23 is
# dropped inside creat_time_series_seq as defined in this cell.
# result() appends one fitted slope per behavior group to `a` / `b`.
a = []
b = []
for seq_id in range(23):
    # First activity slice; the variable names suggest walking
    # (presumably ACT_LABELS[2] is the walking label -- verify).
    act_labels = ACT_LABELS [2:3]
    trial_codes = [TRIAL_CODES[act] for act in act_labels]
    dt_list = set_data_types(sdt)
    dataset_wlk, dataset_ver_wlk = creat_time_series_seq(dt_list, act_labels, trial_codes, seq_id, mode="raw", labeled=True)

    # Second activity slice; the variable names suggest jogging
    # (presumably ACT_LABELS[3] is the jogging label -- verify).
    act_labels = ACT_LABELS [3:4]
    trial_codes = [TRIAL_CODES[act] for act in act_labels]
    dt_list = set_data_types(sdt)
    dataset_jog, dataset_ver_jog = creat_time_series_seq(dt_list, act_labels, trial_codes, seq_id, mode="raw", labeled=True)

    # Split the training and held-out frames by subject id, then by trial.
    dataset_wlk_by_id = split_dataset_by_id(dataset_wlk, 'dataset_wlk')
    dataset_wlk_by_id_and_trial = split_by_trial(dataset_wlk_by_id)

    dataset_jog_by_id = split_dataset_by_id(dataset_jog, 'dataset_jog')
    dataset_jog_by_id_and_trial = split_by_trial(dataset_jog_by_id)

    dataset_ver_wlk_by_id = split_dataset_by_id(dataset_ver_wlk, 'dataset_wlk')
    dataset_ver_wlk_by_id_and_trial = split_by_trial(dataset_ver_wlk_by_id)

    dataset_ver_jog_by_id = split_dataset_by_id(dataset_ver_jog, 'dataset_jog')
    dataset_ver_jog_by_id_and_trial = split_by_trial(dataset_ver_jog_by_id)

    result(dataset_wlk_by_id_and_trial, dataset_jog_by_id_and_trial, dataset_ver_wlk_by_id_and_trial, dataset_ver_jog_by_id_and_trial, a, b)
    print('\n')



In [None]:
# Histogram (10 bins) of the slopes that result() appended to `a`,
# followed by their mean and standard deviation.
plt.hist(a, bins=10, label='a')
print(np.mean(a))
print(np.std(a))

In [None]:
# Histogram (10 bins) of the slopes that result() appended to `b`,
# followed by their mean and standard deviation.
plt.hist(b, bins=10, label='b')
print(np.mean(b))
print(np.std(b))