In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


import nqDataLoader as nq #data loading library

In [24]:
# Read the composite scores CSV and use its 'pID' column as the row index
addedscores = (
    pd.read_csv('data_composite/scores_composite.csv')
    .set_index('pID')
)

# Last expression of the cell: render the resulting table
addedscores

Unnamed: 0_level_0,gt,updrs108,afTap,sTap,nqScore,typingSpeed,file_1,file_2
pID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11,True,14.25,,162.25,0.117543,189.372549,1402930351.011_001_014.csv,1403706430.011_003_014.csv
60,False,2.00,,162.25,0.070350,60.533333,1402932300.060_001_014.csv,1403708258.060_003_014.csv
67,True,25.25,,133.75,0.223411,54.333333,1401117235.067_001_014.csv,1401978395.067_003_014.csv
68,False,6.00,,159.00,0.074973,71.800000,1401114972.068_001_014.csv,1401980765.068_003_014.csv
70,True,26.25,,113.50,0.175751,39.614035,1404311419.070_001_014.csv,1404743687.070_003_014.csv
...,...,...,...,...,...,...,...,...
1063,False,0.00,110.0,170.00,-0.005529,109.800000,1463511198.1063_001_014.csv,
1064,True,28.00,75.5,140.00,0.246866,105.333333,1458723488.1064_001_014.csv,
1066,True,12.00,118.0,170.50,0.033189,140.250000,1460104760.1066_001_014.csv,
1068,True,26.00,65.5,98.00,0.125523,48.800000,1460559248.1068_001_014.csv,


In [28]:
# Concatenate the .csv files file_1 + file_2 per pID
# NOTE(review): this path uses "data composite" (with a space) while the scores
# file above is read from 'data_composite' (underscore) -- confirm which is right.
data_folder = os.path.join("data composite", "data source composite")
output_folder = "concatenated_files"

# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race
os.makedirs(output_folder, exist_ok=True)


def _read_stripped_csv(path):
    """Read a CSV file and strip whitespace from its string columns.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, with leading/trailing whitespace removed from the
        values of every object-dtype column.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (propagated from ``pd.read_csv``).
    """
    frame = pd.read_csv(path)
    # Apply strip only to string (object) columns
    object_columns = frame.select_dtypes(['object']).columns
    frame[object_columns] = frame[object_columns].apply(lambda col: col.str.strip())
    return frame


for index, row in addedscores.iterrows():
    # file_1 is required; without it os.path.join would raise a TypeError
    # and abort the whole loop, so report the row and move on instead.
    if pd.isna(row['file_1']):
        print(f"Error with pID {index}: no file_1 listed")
        continue

    try:
        frames = [_read_stripped_csv(os.path.join(data_folder, row['file_1']))]

        # file_2 is optional: it is only read when the scores table lists one
        if pd.notna(row['file_2']):
            frames.append(_read_stripped_csv(os.path.join(data_folder, row['file_2'])))

        # Stack the recordings row-wise with a fresh 0..n-1 index
        concatenated_df = pd.concat(frames, axis=0, ignore_index=True)

        # Keep only the first four columns
        concatenated_df = concatenated_df.iloc[:, :4]

        # Save the concatenated DataFrame
        concatenated_df.to_csv(os.path.join(output_folder, f"{index}_combined.csv"), index=False)

    except FileNotFoundError as e:
        # A missing source file skips this pID only; nothing is written for it
        print(f"Error with pID {index}: {e}")


In [40]:
# In the combined.csv files, keep A to Z, 0 to 9 and space. Replace other entries by NaN
import pandas as pd
import os
import re

# Define the folders
source_folder = "concatenated_files"
output_folder = "refined_data"

# Create the output folder; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists test
os.makedirs(output_folder, exist_ok=True)

# List all the .csv files in the source folder.
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order reproducible between runs.
csv_files = sorted(file for file in os.listdir(source_folder) if file.endswith('.csv'))

# Allowed values after normalisation: exactly one character in a-z or 0-9,
# or the exact word 'space'
allowed_pattern = re.compile(r'^[a-z0-9]$|^space$')


def clean_entry(entry):
    """Normalise one cell value, or flag it as not allowed.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    Any other value is converted to ``str``, stripped and lower-cased; the
    normalised text is returned when it matches ``allowed_pattern``.
    Everything else yields the literal string ``"NaN"`` -- a string, not a
    float NaN.
    NOTE(review): presumably the CSV write/read round trip is what turns this
    string into a real missing value -- confirm against the downstream cells.
    """
    if pd.isna(entry):
        return entry

    normalized = str(entry).strip().lower()  # lower-case so 'A' is accepted as 'a'
    is_allowed = allowed_pattern.fullmatch(normalized) is not None
    return normalized if is_allowed else "NaN"

# Clean the first column of every listed file and write the result to the
# output folder, with 'combined' replaced by 'refined' in the file name
for csv_file in csv_files:
    refined_name = csv_file.replace('combined', 'refined')

    frame = pd.read_csv(os.path.join(source_folder, csv_file))

    # Only column 0 goes through clean_entry; the other columns are written back as read
    first_column = frame.iloc[:, 0]
    frame.iloc[:, 0] = first_column.apply(clean_entry)

    frame.to_csv(os.path.join(output_folder, refined_name), index=False)

print("All files have been processed and saved in the 'refined_data' folder.")


All files have been processed and saved in the 'refined_data' folder.


In [43]:
# remove lines before the first 'space'
import pandas as pd
import os

# Define the folders
source_folder = "refined_data"  # The folder with the original refined data
destination_folder = "refined2_data"  # The folder for the second round of refined data

# Create the destination folder; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists test
os.makedirs(destination_folder, exist_ok=True)

# List all the .csv files in the source folder.
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order reproducible between runs.
csv_files = sorted(file for file in os.listdir(source_folder) if file.endswith('.csv'))

# Process each CSV file: drop every row that precedes the first 'space' entry
for csv_file in csv_files:
    input_file_path = os.path.join(source_folder, csv_file)
    output_file_name = csv_file.replace('refined', 'refined2')
    output_file_path = os.path.join(destination_folder, output_file_name)

    # Read the CSV file without headers: the file's first line becomes
    # ordinary data row 0, so it is trimmed like any other leading row
    df = pd.read_csv(input_file_path, header=None)

    # Rows whose first column is exactly "space"
    is_space = df.iloc[:, 0].eq('space')

    # Without any "space" there is no starting row; previously this case
    # produced a NaN start label and crashed the loop. Report and skip instead.
    if not is_space.any():
        print(f"No 'space' entry found in {csv_file}; file skipped.")
        continue

    # idxmax on a boolean Series gives the label of the first True value
    first_space_index = is_space.idxmax()

    # Remove all rows above the first "space" entry
    df = df.loc[first_space_index:].reset_index(drop=True)

    # Save the modified DataFrame to the new folder
    df.to_csv(output_file_path, index=False, header=False)

print("All files have been processed and saved in the 'refined2_data' folder.")


All files have been processed and saved in the 'refined2_data' folder.
