# TP4

## Dataset

### Dataset study and objectives

### Choosing the variables

### Dataset cleaning

In [30]:
import pandas as pd
import json
import os
import glob

# Path to the dataset folder
dos_path = './_data/dos_dataset'


def load_trace_file(file_path, message_type=3):
    """Load one JSON-lines trace file and keep the records of a single type.

    Parameters
    ----------
    file_path : str
        Path to a file named like ``traceJSON-<receiver>-<...>.json``. The
        receiver id is the second dash-separated token of the file name.
    message_type : int, default 3
        Value of the ``type`` field a record must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The kept records (original row index preserved), with an extra
        ``receiver_id`` column holding the id as a string.

    Raises
    ------
    KeyError
        If no record in the file has a ``type`` field (e.g. an empty file).
    """
    # Extracting the receiver id from the name of the file
    receiver_id = os.path.basename(file_path).split('-')[1]

    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            # A blank line is not a JSON document; json.loads('') would raise
            if not raw_line:
                continue
            record = json.loads(raw_line)
            record['receiver_id'] = receiver_id
            records.append(record)

    df = pd.DataFrame(records)

    # .copy() so that later column assignments do not hit a view of `df`
    return df[df['type'] == message_type].copy()


# Finding the JSON files. glob order depends on the file system, so the paths
# are sorted to make dfs[k] refer to the same trace file on every machine.
file_paths = sorted(glob.glob(os.path.join(dos_path, 'traceJSON-*-*.json')))

# One filtered dataframe per trace file, in the order of file_paths
dfs = [load_trace_file(file_path) for file_path in file_paths]


### Further processing

In [31]:
sender_labels_path = './_data/sender_labels.csv'

# Semicolon-separated table that is joined to the messages on 'sender'
df_labels = pd.read_csv(sender_labels_path, delimiter=';')

# Columns to split (each cell holds a list of values)
columns_to_split = ['pos', 'pos_noise', 'spd', 'spd_noise', 'acl', 'acl_noise', 'hed', 'hed_noise']


def expand_list_columns(df, columns):
    """Replace list-valued columns by one scalar column per list element.

    A column ``c`` whose cells are lists of length k becomes ``c1`` ... ``ck``.
    The new columns are appended to the right in the order of ``columns`` and
    the original list columns are dropped. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every name in ``columns``.
    columns : list of str
        Names of the list-valued columns to expand.

    Returns
    -------
    pandas.DataFrame
        New frame with the expanded columns.
    """
    pieces = [df.drop(columns=columns)]
    for col in columns:
        # One constructor call per column instead of one pd.Series per row
        parts = pd.DataFrame(df[col].tolist(), index=df.index)
        # Number of sub-columns comes from the expanded width, so an empty
        # frame or a non-zero-based index does not break the naming
        parts.columns = [f"{col}{k + 1}" for k in range(parts.shape[1])]
        pieces.append(parts)
    return pd.concat(pieces, axis=1)


# NOTE: this cell overwrites the entries of dfs and drops the list columns it
# needs, so it must be run exactly once after the loading cell.
for i in range(len(dfs)):
    merged = pd.merge(dfs[i], df_labels, how='left', on='sender')
    dfs[i] = expand_list_columns(merged, columns_to_split)


# Column names of the first processed dataframe

print(dfs[0].columns)


Index(['type', 'rcvTime', 'receiver_id', 'sendTime', 'sender', 'senderPseudo',
       'messageID', 'label', 'pos1', 'pos2', 'pos3', 'pos_noise1',
       'pos_noise2', 'pos_noise3', 'spd1', 'spd2', 'spd3', 'spd_noise1',
       'spd_noise2', 'spd_noise3', 'acl1', 'acl2', 'acl3', 'acl_noise1',
       'acl_noise2', 'acl_noise3', 'hed1', 'hed2', 'hed3', 'hed_noise1',
       'hed_noise2', 'hed_noise3'],
      dtype='object')


In [32]:
from sklearn.model_selection import train_test_split

# Function to create training and test splits for each dataframe
def create_splits(dfs, test_size=0.2, random_state=42, target_column='label'):
    """Create a train/test split for every dataframe in ``dfs``.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to split; each must contain ``target_column``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.
    target_column : str, default 'label'
        Name of the column used as target ``y``; every other column is
        kept as a feature in ``X``.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple per input frame,
        in the same order as ``dfs``.
    """
    train_test_splits = []
    for df in dfs:
        # Split dataframe into features (X) and target variable (y)
        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Perform train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Store the splits in a tuple and append to the list
        train_test_splits.append((X_train, X_test, y_train, y_test))

    return train_test_splits

# Create training and test splits for each dataframe, using the defaults of
# create_splits (test_size=0.2, random_state=42)
train_test_splits = create_splits(dfs)

# Print the shape of every part of each split as a quick sanity check.
# The loop variables (i, X_train, X_test, y_train, y_test) are notebook-level
# names, so after the loop they hold the values of the last split.
for i, (X_train, X_test, y_train, y_test) in enumerate(train_test_splits):
    print(f"Split {i+1}:")
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)
    print()


Split 1:
X_train shape: (492, 31)
X_test shape: (123, 31)
y_train shape: (492,)
y_test shape: (123,)

Split 2:
X_train shape: (948, 31)
X_test shape: (238, 31)
y_train shape: (948,)
y_test shape: (238,)

Split 3:
X_train shape: (952, 31)
X_test shape: (238, 31)
y_train shape: (952,)
y_test shape: (238,)

Split 4:
X_train shape: (433, 31)
X_test shape: (109, 31)
y_train shape: (433,)
y_test shape: (109,)

Split 5:
X_train shape: (305, 31)
X_test shape: (77, 31)
y_train shape: (305,)
y_test shape: (77,)



### Distribution of data in the dataset

## Federated Learning Model

### Model chosen for our task