In [1]:
import math
from decimal import *
import os
import re
import shutil
import sys
import warnings
import time

import chardet
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from statistics import median

from constants import bins_axis_names, bin_dimensions_regex, boat_speed_feature, \
                    boxplot_axis_name, feature_regex, identifier_features, \
                    fiber_optics_structure_features, fiber_optics_appendix_features, \
                    other_sensor_features, statistics_features, wind_features

In [2]:
def read_csv(fnames):
    """Load several ';'-separated CSV files and stack them into one DataFrame.

    The encoding of each file is guessed by ``chardet`` from the file's raw
    bytes and handed to ``pd.read_csv``. The frames are concatenated in the
    order of ``fnames`` without sorting the columns; ``ignore_index`` is not
    set, so every file keeps its own row labels in the result.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all files.
    """
    def load_one(path):
        # Read the bytes once for encoding detection, then let pandas parse
        # the file by path while the handle is still open.
        with open(path, 'rb') as raw:
            guessed_encoding = chardet.detect(raw.read())['encoding']
            return pd.read_csv(path, sep=';', encoding=guessed_encoding)

    frames = [load_one(path) for path in fnames]
    return pd.concat(frames, sort=False)

def preprocess_data(df,
                identifier_cols=identifier_features,
                cols_to_preprocess=fiber_optics_structure_features + fiber_optics_appendix_features,
                other_cols=other_sensor_features + wind_features + [boat_speed_feature],
                regex=feature_regex):
    """Build a renamed frame from ``df``, averaging the regex-matched columns.

    Every entry of the three column lists is an ``(old_name, new_name)`` pair.
    Pairs listed in ``identifier_cols`` or ``other_cols`` are copied over
    unchanged under their new name. For every pair in ``cols_to_preprocess``
    the template ``regex`` is formatted with the old name, all columns of
    ``df`` matching the resulting pattern are selected, and their row-wise
    mean becomes the new column.

    Output columns appear in the order identifiers, preprocessed, others.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    copied_as_is = identifier_cols + other_cols
    ordered_pairs = identifier_cols + cols_to_preprocess + other_cols

    result = pd.DataFrame()
    for source_name, target_name in ordered_pairs:
        if (source_name, target_name) not in copied_as_is:
            # Aggregate every raw column whose name matches the pattern.
            pattern = regex.format(source_name)
            result[target_name] = df.filter(regex=pattern).mean(axis=1)
            continue
        result[target_name] = df[source_name]
    return result

In [3]:
# Parsed polar tables: absolute path -> (file mtime, (angles, speeds, grid)).
_POLAR_CACHE = {}


def _load_polar(path):
    """Return the polar table stored at ``path`` as sorted numpy arrays.

    The file is parsed once and served from ``_POLAR_CACHE`` afterwards; it is
    re-read only when its modification time changes.

    Returns a tuple ``(angles, speeds, grid)`` where ``angles`` holds the
    ``TWA`` column, ``speeds`` the integer column headers, and
    ``grid[i, j]`` the table value for ``angles[i]`` / ``speeds[j]``.
    Both axes are sorted ascending.
    """
    key = os.path.abspath(path)
    mtime = os.path.getmtime(key)
    cached = _POLAR_CACHE.get(key)
    if cached is not None and cached[0] == mtime:
        return cached[1]

    pol = pd.read_csv(key, sep=';')
    speed_cols = [col for col in pol.columns if col != 'TWA']
    if pol.empty or not speed_cols:
        raise ValueError(
            '{} must contain a TWA column, wind-speed columns and at least one row'.format(path))

    speeds = np.array([int(col) for col in speed_cols], dtype=float)
    angles = pol['TWA'].values.astype(float)
    grid = pol[speed_cols].values.astype(float)

    # The lookup below relies on ascending axes, so sort instead of trusting
    # the order in the file.
    angle_order = np.argsort(angles, kind='stable')
    speed_order = np.argsort(speeds, kind='stable')
    table = (angles[angle_order], speeds[speed_order], grid[angle_order][:, speed_order])

    _POLAR_CACHE[key] = (mtime, table)
    return table


def _bracket(axis, value):
    """Return indices ``(lo, hi)`` of the grid points around ``value``.

    ``value`` must already lie within ``[axis[0], axis[-1]]``. On an exact hit
    both indices are equal.
    """
    hi = int(np.searchsorted(axis, value, side='left'))
    lo = hi if axis[hi] == value else hi - 1
    return lo, hi


def _lerp(x0, x1, y0, y1, x):
    """Linearly interpolate between ``(x0, y0)`` and ``(x1, y1)`` at ``x``."""
    if x0 == x1:
        return y0
    return y0 + (y1 - y0) * (x - x0) / (x1 - x0)


def pol_speed(wind_speed, wind_angle, polar_path='Polaire.csv'):
    """Look up the polar value for a wind speed and wind angle.

    The table is read from ``polar_path`` (';'-separated, a ``TWA`` column
    plus one column per integer wind speed). The sign of the angle is ignored,
    the angle is clamped to [40, 160] and the wind speed to [2, 50], then both
    are clamped to the range the table actually covers. The result is the
    bilinear interpolation between the four surrounding table cells
    (degenerating to linear interpolation or a direct lookup on exact hits).

    Parameters
    ----------
    wind_speed : float
        Wind speed, in the unit of the table's column headers.
    wind_angle : float
        Wind angle, in the unit of the ``TWA`` column; may be negative.
    polar_path : str, optional
        Location of the polar file. Defaults to ``'Polaire.csv'`` in the
        current working directory.

    Returns
    -------
    float
        The interpolated table value, or NaN when either input is NaN.
        Callers that wrap the result in ``float()`` keep working.

    Raises
    ------
    ValueError
        If the polar file has no rows or no wind-speed columns.
    """
    wind_speed = float(wind_speed)
    wind_angle = abs(float(wind_angle))
    if math.isnan(wind_speed) or math.isnan(wind_angle):
        return float('nan')

    angles, speeds, grid = _load_polar(polar_path)

    # Fixed operating envelope first, then the table's own extent so that a
    # smaller table can never leave the bracket search without a neighbour.
    wind_angle = min(max(wind_angle, 40.0), 160.0)
    wind_angle = min(max(wind_angle, angles[0]), angles[-1])
    wind_speed = min(max(wind_speed, 2.0), 50.0)
    wind_speed = min(max(wind_speed, speeds[0]), speeds[-1])

    angle_lo, angle_hi = _bracket(angles, wind_angle)
    speed_lo, speed_hi = _bracket(speeds, wind_speed)

    # Interpolate along wind speed on both bracketing angle rows ...
    at_angle_lo = _lerp(speeds[speed_lo], speeds[speed_hi],
                        grid[angle_lo, speed_lo], grid[angle_lo, speed_hi], wind_speed)
    at_angle_hi = _lerp(speeds[speed_lo], speeds[speed_hi],
                        grid[angle_hi, speed_lo], grid[angle_hi, speed_hi], wind_speed)
    # ... then along the angle between those two intermediate values.
    return float(_lerp(angles[angle_lo], angles[angle_hi],
                       at_angle_lo, at_angle_hi, wind_angle))

In [4]:
# Silence every warning for the rest of the session: no category is given,
# so the filter applies to all Warning subclasses, RuntimeWarning included.
warnings.simplefilter('ignore')

# A threshold of 0 switches off matplotlib's "too many open figures" warning.
plt.rcParams['figure.max_open_warning'] = 0

# Version tag.
version = 'v2'

In [None]:
# Reads the raw CSV logs and returns a DataFrame holding only the data that is
# usable for analysing the boat's behaviour.

def _flag_rows_below_polar(df, min_speed_ratio):
    """Return a boolean array marking rows sailed below the polar threshold.

    A row is flagged when ``Boat_Speed < min_speed_ratio * pol_speed(Wind_Speed,
    Wind_Angle)``. Rows with a missing wind speed or wind angle are never
    flagged (the comparison against NaN is False).
    """
    targets = []
    for wind_speed, wind_angle in zip(df['Wind_Speed'], df['Wind_Angle']):
        if pd.isnull(wind_speed) or pd.isnull(wind_angle):
            targets.append(np.nan)
        else:
            targets.append(float(pol_speed(float(wind_speed), float(wind_angle))))
    boat_speed = df['Boat_Speed'].values.astype(float)
    return boat_speed < min_speed_ratio * np.asarray(targets, dtype=float)


def _drop_unusable_columns(df, min_valid):
    """Drop columns that carry too little information and print a summary.

    Three passes, in this order:
    1. columns with fewer than ``min_valid`` non-null values;
    2. columns (from the third one on) whose mean is +inf or -inf;
    3. columns (from the third one on) with fewer than ``min_valid`` values
       different from 0 -- NaN counts as "different from 0".
    """
    sparse = [col for col in df.columns if df[col].count() < min_valid]
    df = df.drop(columns=sparse)
    print('Variables avec trop de Nan :', sparse)

    # The first two columns are skipped, as in the original implementation
    # (presumably the identifier columns -- TODO confirm).
    infinite = [col for col in df.columns[2:]
                if pd.api.types.is_numeric_dtype(df[col]) and np.isinf(df[col].mean())]
    df = df.drop(columns=infinite)
    print('Variables avec trop de "inf" ou "-inf" :', infinite)

    mostly_zero = [col for col in df.columns[2:] if (df[col] != 0).sum() < min_valid]
    df = df.drop(columns=mostly_zero)
    print('Variables avec trop de 0 :', mostly_zero)

    kept = len(df.columns)
    print(kept, "variables sur", kept + len(sparse) + len(infinite) + len(mostly_zero), "sont exploitables")
    return df


def _add_tack_features(df):
    """Add the tack label and the single-sided / combined rake variables.

    For every port/starboard pair the Starboard_* column is used where
    ``Apparent_Wind_Angle < 0`` and the Port_* column otherwise.
    """
    wind_from_negative_side = df['Apparent_Wind_Angle'] < 0

    def pick(starboard_col, port_col):
        return np.where(wind_from_negative_side, df[starboard_col], df[port_col])

    # Tack-dependent variables.
    df['Tack'] = np.where(wind_from_negative_side, 'Port', 'Starboard')
    df['Foil'] = pick('Starboard_Foil_Max_Deformation_in_4', 'Port_Foil_Max_Deformation_in_4')
    # NOTE(review): Foil_Rake is built from the very same columns as Foil,
    # which looks like a copy-paste slip; the intended rake columns are not
    # visible here, so the original formula is kept -- TODO confirm.
    df['Foil_Rake'] = pick('Starboard_Foil_Max_Deformation_in_4', 'Port_Foil_Max_Deformation_in_4')
    df['Rudder_Angle'] = pick('Rudder_Angle_Starboard', 'Rudder_Angle_Port')
    df['Rudder_Elevator'] = pick('Rudder_Elevator_Angle_Starboard', 'Rudder_Elevator_Angle_Port')
    df['Rudder_Load_I'] = pick('Rudder_Inside_Load_Starboard', 'Rudder_Inside_Load_Port')
    df['Rudder_Load_o'] = pick('Rudder_Outside_Load_Starboard', 'Rudder_Outside_Load_Port')
    df['Shroud'] = pick('Shroud_Load_Starboard', 'Shroud_Load_Port')

    # Lift-related variables.
    df['Front_Rake'] = df['Foil_Rake'] + df['Board_Elevator_Angle_Center']
    df['Aft_Rake'] = df['Rudder_Elevator'] + df['Rudder_Elevator_Angle_Center']
    df['Leeward_Rake'] = df['Foil_Rake'] + df['Rudder_Elevator']
    # NOTE(review): same two operands as Aft_Rake, so both columns are equal;
    # kept as in the original -- TODO confirm the intended formula.
    df['Center_rake'] = df['Rudder_Elevator_Angle_Center'] + df['Rudder_Elevator']
    return df


def _drop_side_specific_columns(df):
    """Remove the per-side raw columns once the combined variables exist."""
    foil_cols = [
        '{}_Foil_{}_Deformation_{}_{}'.format(side, extreme, face, position)
        for face in ('in', 'out')
        for extreme in ('Max', 'Min')
        for position in range(1, 9)
        for side in ('Starboard', 'Port')
    ]
    # Some deformation columns may already be gone after the column filter.
    df = df.drop(columns=foil_cols, errors='ignore')

    # These were all read when building the combined variables, so they exist.
    return df.drop(columns=[
        'Rudder_Angle_Starboard', 'Rudder_Angle_Port',
        'Rudder_Elevator_Angle_Starboard', 'Rudder_Elevator_Angle_Port',
        'Rudder_Inside_Load_Starboard', 'Rudder_Inside_Load_Port',
        'Rudder_Outside_Load_Starboard', 'Rudder_Outside_Load_Port',
        'Shroud_Load_Starboard', 'Shroud_Load_Port',
    ])


def clean_data(file_ids=('59', '111', '132', '308', '516', '525'),
               min_speed_ratio=0.75,
               min_valid=50):
    """Load the raw logs and keep only rows and columns usable for analysis.

    Steps:
    1. read ``<file_id>.csv`` for every id and rename/aggregate the columns
       with ``preprocess_data``;
    2. drop the rows where ``Boat_Speed`` is below ``min_speed_ratio`` times
       the polar speed for the row's wind speed and angle;
    3. drop columns with fewer than ``min_valid`` usable values (NaN, inf or
       zero dominated);
    4. derive the tack-dependent and rake variables and drop the per-side
       raw columns.

    Filters that were already disabled in the original (duplicate removal,
    mainsail not hoisted, +/-10 minutes around head-to-wind and dead-downwind
    angles) are not applied.

    Parameters
    ----------
    file_ids : iterable of str, optional
        Base names (without ``.csv``) of the files to load.
    min_speed_ratio : float, optional
        Fraction of the polar speed below which a row is discarded.
    min_valid : int, optional
        Minimum number of usable values a column needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned data with a fresh 0..n-1 index.
    """
    raw = read_csv(['{}.csv'.format(file_id) for file_id in file_ids])

    # read_csv keeps each file's own row labels, so labels repeat across
    # files; renumber before any label-based work.
    df = preprocess_data(raw).reset_index(drop=True)

    # --- Remove unusable rows -------------------------------------------
    rows_before = len(df)
    too_slow = _flag_rows_below_polar(df, min_speed_ratio)
    df = df.loc[~too_slow].reset_index(drop=True)

    rows_removed = rows_before - len(df)
    removed_pct = rows_removed / rows_before * 100 if rows_before else 0.0
    print(Decimal(removed_pct).quantize(Decimal('.1'), rounding=ROUND_HALF_UP),'% des données ont été ignorées')

    # --- Remove unusable columns ----------------------------------------
    df = _drop_unusable_columns(df, min_valid)

    # --- Derived variables, then drop the raw per-side inputs -----------
    df = _add_tack_features(df)
    df = _drop_side_specific_columns(df)

    return df

In [None]:
# Run the whole cleaning pipeline with its defaults; this reads the source
# CSV files and prints the filtering summary.
df = clean_data()

In [None]:
# Persist the cleaned data as a ';'-separated CSV. The row index is written
# as the first column because index=False is not passed.
df.to_csv('Final_Data.csv', sep=';')