In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [4]:
class TrainSatisfactionSimulator:
    def __init__(self, n_customers : int):
        self.n_customers = n_customers
        self.satisfaction = np.zeros(n_customers)
        self.price = None
        self.punctuality = None
        self.duration = None
        self.frequency = None
        self.overcrowding = None
        self.data = self.generate_data(n_customers)
    
    def generate_independent_vars(self, n_customers : int):
        prices = np.arange(2.20, 100, 0.05)
        p_prices = np.random.normal(25, 0.25,size=len(prices))
        p_prices /= np.sum(p_prices)
        self.price = np.random.choice(prices, p=p_prices, size=n_customers)
        
        self.punctuality = np.random.choice([1, 2, 3, 4, 5], p=[0.03, 0.07, 0.1, 0.3, 0.5], size=n_customers) # 5 globalement très ponctuel, 1 globalement très en retard
        
        duration_table = np.arange(2.20, 100, 0.05)
        len_dur = len(duration_table)
        p_duration = np.random.normal(4.8, 0.25, size=len_dur) # moyenne par jour
        p_duration /= np.sum(p_duration)
        self.duration = np.random.choice(duration_table, p=p_duration, size=n_customers)
        
        freq_table = np.arange(1, 260)
        len_freq = len(freq_table)
        p_freq = np.random.normal(70, 0.25, size=len_freq)
        p_freq /= np.sum(p_freq)
        self.frequency = np.random.choice(freq_table, p=p_freq, size=n_customers) # par an (entre 1 et 260)
        
        self.overcrowding = np.random.choice([1, 2, 3, 4, 5], size=n_customers)
        
    def generate_dependent_var(self, n_customers : int):
        pass
    
    def generate_data(self, n_customers):
        self.generate_independent_vars(n_customers)
        self.generate_dependent_var(n_customers)
        return np.array([self.price, self.punctuality, self.duration, 
                             self.frequency, self.overcrowding, self.satisfaction]).T

      

### **sources d'inspiration** :
- Durée : https://www.bfs.admin.ch/bfs/fr/home/statistiques/mobilite-transports/transport-personnes/comportements-transports/distance-duree-jour-deplacements.html
- Fréquence : https://www.bav.admin.ch/bav/fr/home/modes-de-transport/chemin-de-fer/transport-des-voyageurs.html
- Propreté :https://www.bav.admin.ch/bav/fr/home/themes-generaux/trv/systeme-de-mesure-de-la-qualite.html


In [5]:
class IndependentSatisfaction(TrainSatisfactionSimulator):
    def __init__(self, n_customers):
        super().__init__(n_customers)
    def generate_dependent_var(self, n_customers):
        for i in range (n_customers):
            d = np.random.choice(2)
            self.satisfaction[i] = d

train_data = IndependentSatisfaction(1000)
train_data.data

array([[ 99.6 ,   5.  ,  43.6 ,  52.  ,   3.  ,   0.  ],
       [ 28.6 ,   5.  ,  74.8 ,  93.  ,   4.  ,   1.  ],
       [ 42.15,   5.  ,  51.55, 124.  ,   2.  ,   0.  ],
       ...,
       [ 81.5 ,   4.  ,  46.25,  49.  ,   1.  ,   1.  ],
       [ 76.6 ,   5.  ,   5.95, 151.  ,   5.  ,   1.  ],
       [ 99.1 ,   5.  ,  44.1 , 179.  ,   1.  ,   1.  ]])

In [6]:
class SimpleDependentSatisfaction(TrainSatisfactionSimulator):
    def __init__(self, n_customers):
        super().__init__(n_customers)
        
    def generate_dependent_var(self, n_customers):
        """ 
            prix influence negative 0.2
            durée influence negative 0.1
            fréquence influence negative 0.2
            ponctualité influence positive 0.3
            overcrowding inlfuence negative 0.2
            
        """
        prices = self.price
        for i in range(n_customers):
            if (prices[i] > 50):
                self.satisfaction[i] = 1
            else:
                self.satisfaction[i] = 0
    
train_data = SimpleDependentSatisfaction(1000)
train_data.data

array([[ 91.55,   4.  ,  51.05, 207.  ,   4.  ,   1.  ],
       [ 76.75,   3.  ,  36.45,  46.  ,   4.  ,   1.  ],
       [ 18.95,   5.  ,  89.15, 220.  ,   5.  ,   0.  ],
       ...,
       [ 46.1 ,   3.  ,  53.2 ,  80.  ,   4.  ,   0.  ],
       [ 70.65,   4.  ,  45.7 ,  63.  ,   2.  ,   1.  ],
       [ 58.65,   4.  ,  41.85, 107.  ,   4.  ,   1.  ]])

In [7]:
class ComplexDependentSatisfaction(TrainSatisfactionSimulator):
    def __init__(self, n_customers):
        super().__init__(n_customers)
        
    def generate_dependent_var(self, n_customers):

        data = np.array([self.price, self.punctuality, self.duration, 
                             self.frequency, self.overcrowding]).T
        scaler = MinMaxScaler()
        data = scaler.fit_transform(data)
        prices = data[:,0]
        punctuality = data[:,1]
        duration = data[:,2]
        frequency = data[:,3]
        overcrowding = data[:,4]
        
        i_price = 0.2
        i_dur = 0.1
        i_freq = 0.2
        i_punct = 0.3
        i_overcrow = 0.2
        for i in range(n_customers):
            score = 1 + ((i_price * -prices[i]) +  (i_dur * -duration[i])
            + (i_freq * -frequency[i]) + (i_punct * punctuality[i]) 
            + (i_overcrow * -overcrowding[i]))
            p = 1 / (1 + np.exp(-score))
            d = np.random.choice(2, p = [1 - p, p])
            self.satisfaction[i] = d
    

train_data = ComplexDependentSatisfaction(1000)
train_data.data

array([[ 72.85,   5.  ,  64.05, 140.  ,   2.  ,   1.  ],
       [ 74.7 ,   5.  ,  95.8 , 152.  ,   3.  ,   1.  ],
       [ 46.8 ,   4.  ,  29.8 , 242.  ,   2.  ,   1.  ],
       ...,
       [ 91.6 ,   5.  ,  70.5 , 204.  ,   5.  ,   1.  ],
       [ 34.4 ,   5.  ,  39.25,  49.  ,   2.  ,   0.  ],
       [  5.15,   4.  ,  86.3 , 147.  ,   5.  ,   1.  ]])

In [11]:
class PondDependentSatisfaction(TrainSatisfactionSimulator):
    def __init__(self, n_customers):
        super().__init__(n_customers)
        
    def generate_dependent_var(self, n_customers):
        """
        Calcule la satisfaction binaire (1 ou 0) en fonction des variables indépendantes
        pondérées par leur facteur d'importance.
        """
        # Facteurs d'importance
        i_price = 0.2
        i_punctuality = 0.3
        i_duration = 0.1
        i_frequency = 0.2
        i_overcrowding = 0.2

        # Crée un tableau de données et applique une normalisation
        data = np.array([self.price, self.punctuality, self.duration, 
                         self.frequency, self.overcrowding]).T
        scaler = MinMaxScaler()
        data = scaler.fit_transform(data)  # Met à l'échelle entre [0, 1]

        # Applique les facteurs d'importance pour chaque variable
        weighted_sum = (i_price * data[:, 0] +
                        i_punctuality * data[:, 1] +
                        i_duration * data[:, 2] +
                        i_frequency * data[:, 3] +
                        i_overcrowding * data[:, 4])

        # Calcule la satisfaction : 0 si la somme pondérée > 0.5, sinon 1
        self.satisfaction = np.where(weighted_sum > 0.5, 0, 1)  # Utilise un seuil de 0.5 après normalisation
        

train_data = PondDependentSatisfaction(10)
train_data.data

array([[ 25.15,   4.  ,  80.9 ,  97.  ,   3.  ,   1.  ],
       [ 72.65,   5.  ,  39.6 ,  55.  ,   1.  ,   0.  ],
       [ 97.5 ,   4.  ,  53.85,  91.  ,   5.  ,   0.  ],
       [ 77.4 ,   4.  ,  66.25,  25.  ,   3.  ,   0.  ],
       [ 87.4 ,   4.  ,  71.85,  26.  ,   4.  ,   0.  ],
       [ 87.95,   2.  ,  33.4 ,  46.  ,   4.  ,   1.  ],
       [ 66.25,   2.  ,   5.6 ,  46.  ,   3.  ,   1.  ],
       [ 36.05,   5.  ,  78.5 , 211.  ,   4.  ,   0.  ],
       [ 36.1 ,   3.  ,   4.1 ,  60.  ,   5.  ,   1.  ],
       [ 21.5 ,   4.  ,  87.45,  73.  ,   4.  ,   0.  ]])

## Alternative model

In [None]:
class TrainSatisfactionSimulator:
    def __init__(self, n_customers):
        self.n_customers = n_customers
        self.price = None
        self.punctuality = None
        self.duration = None
        self.frequency = None
        self.overcrowding = None
        self.satisfaction = np.zeros(n_customers)  # Satisfaction binaire (1 ou 0)
        self.data_matrix = None  # Matrice pour stocker les données complètes
        
        self.generate_independent_vars()
        self.generate_satisfaction()
        self.create_data_matrix()

    def generate_independent_vars(self):
        """
        Génère les valeurs pour price, punctuality, duration, frequency et overcrowding.
        Chaque variable suit une distribution normale entre 1 et 5.
        """
        mean, std_dev = 3, 1  # Moyenne centrée sur 3 pour rester dans [1,5]

        # Génération des variables indépendantes en suivant une distribution normale
        self.price = np.clip(np.round(np.random.normal(mean, std_dev, self.n_customers)), 1, 5)
        self.punctuality = np.clip(np.round(np.random.normal(mean, std_dev, self.n_customers)), 1, 5)
        self.duration = np.clip(np.round(np.random.normal(mean, std_dev, self.n_customers)), 1, 5)
        self.frequency = np.clip(np.round(np.random.normal(mean, std_dev, self.n_customers)), 1, 5)
        self.overcrowding = np.clip(np.round(np.random.normal(mean, std_dev, self.n_customers)), 1, 5)

    def generate_satisfaction(self):
        """
        Calcule la satisfaction binaire (1 ou 0) en fonction des variables indépendantes
        pondérées par leur facteur d'importance.
        """
        # Facteurs d'importance
        i_price = 0.2
        i_punctuality = 0.3
        i_duration = 0.1
        i_frequency = 0.2
        i_overcrowding = 0.2

        # Crée un tableau de données et applique une normalisation
        data = np.array([self.price, self.punctuality, self.duration, 
                         self.frequency, self.overcrowding]).T
        scaler = MinMaxScaler()
        data = scaler.fit_transform(data)  # Met à l'échelle entre [0, 1]

        # Applique les facteurs d'importance pour chaque variable
        weighted_sum = (i_price * data[:, 0] +
                        i_punctuality * data[:, 1] +
                        i_duration * data[:, 2] +
                        i_frequency * data[:, 3] +
                        i_overcrowding * data[:, 4])

        # Calcule la satisfaction : 0 si la somme pondérée > 0.5, sinon 1
        self.satisfaction = np.where(weighted_sum > 0.5, 0, 1)  # Utilise un seuil de 0.5 après normalisation

    def create_data_matrix(self):
        """
        Crée une matrice de données combinant toutes les variables indépendantes et la satisfaction.
        """
        # Combine les variables et la satisfaction dans une matrice
        self.data_matrix = np.column_stack((self.price, self.punctuality, self.duration,
                                            self.frequency, self.overcrowding, self.satisfaction))
    
    def display_data_matrix(self):
        """
        Affiche la matrice des données pour chaque client.
        """
        print("Data Matrix (price, punctuality, duration, frequency, overcrowding, satisfaction):")
        print(self.data_matrix)