In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
import time 
from sklearn.metrics import classification_report
# from imblearn.over_sampling import RandomOverSampler

In [2]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('TkAgg')  # Replace 'TkAgg' with the appropriate backend name if needed
import seaborn as sns


# Charger le dataset Titanic
df = pd.read_csv('train.txt')

# Encoder les variables catégorielles
label_encoder = LabelEncoder()
categorical_vars = ['Embarked', 'Cabin']
for var in categorical_vars:
    df[var] = label_encoder.fit_transform(df[var].astype(str))

# Créer une copie du DataFrame pour la manipulation
df_imputed = df.copy()

# Variables à imputer
vars_with_missing_values = ['Age', 'Embarked', 'Cabin']

# Imputer les valeurs manquantes avec KNNImputer
imputer = KNNImputer(n_neighbors=5)
df_imputed[vars_with_missing_values] = imputer.fit_transform(df_imputed[vars_with_missing_values])

# Variables incomplètes après imputation
incomplete_vars = ['Embarked', 'Cabin']

# Créer un modèle prédictif pour chaque variable incomplète
for var in incomplete_vars:
    complete_vars = [x for x in incomplete_vars if x != var]
    complete_data = df_imputed.loc[df_imputed[var].notnull(), complete_vars]
    incomplete_data = df_imputed.loc[df_imputed[var].isnull(), complete_vars]

    if incomplete_data.shape[0] > 0:
        target = df_imputed.loc[df_imputed[var].notnull(), var]
        model = HistGradientBoostingRegressor()
        model.fit(complete_data, target)
        predicted_values = model.predict(incomplete_data)
        df_imputed.loc[df_imputed[var].isnull(), var] = predicted_values

# Vérification des modifications
missing_values_after = df_imputed.isnull().sum()
print(missing_values_after)

# Configuration des options d'affichage pour afficher toutes les lignes et toutes les colonnes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)  # Définir la largeur d'affichage sur None pour éviter la troncature des colonnes

# Afficher le dataset après élimination des valeurs manquantes
print(df_imputed)
df_imputed.to_csv('train_imputed.csv', index=False)




'''Partie de elimination des valeures aberantes '''
# Chargement des données Titanic
data = pd.read_csv('train_imputed.csv')



# Détection et traitement des valeurs aberrantes pour la caractéristique "Age"
feature = 'Age'

# Visualisation améliorée avec un boxplot et un histogramme
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.boxplot(data[feature], ax=axes[0])
axes[0].set_ylabel(feature)
sns.histplot(data[feature], kde=True, ax=axes[1])
axes[1].set_xlabel(feature)
plt.show()

# Détection des valeurs aberrantes
Q1 = data[feature].quantile(0.25)
Q3 = data[feature].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR

outliers = data[(data[feature] > upper_bound) | (data[feature] < lower_bound)]
print(f"Valeurs aberrantes pour la caractéristique '{feature}':")
print(outliers)

# Traitement des valeurs aberrantes (remplacement par la médiane)
median_age = data[feature].median()
data.loc[outliers.index, feature] = median_age

# Affichage des lignes avec les valeurs aberrantes remplacées
print("Lignes avec les valeurs aberrantes remplacées:")
print(data.loc[outliers.index])

# Affichage du jeu de données après traitement des valeurs aberrantes
print("Jeu de données après traitement des valeurs aberrantes:")
data.to_csv('resultat.csv', index=False)



PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Duplicate      0
dtype: int64
    PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             4         1       1   
5             5         0       3   
6             6         0       3   
7             7         0       1   
8             7         0       1   
9             8         0       3   
10            9         1       3   
11           10         1       2   
12           11         1       3   
13           12         1       1   
14           13         0       3   
15           14         0       3   
16           15         0       3   
17           16         1       2   
18           16         1       2   
19           17         0 

In [5]:
# Split the dataset into features and target variable
mydata = pd.read_csv("resultat.csv")
mydata
# oversampler = RandomOverSampler(random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Duplicate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,5.0,2.0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence BriggsTha...",female,38.0,1,0,PC 17599,71.2833,2.0,0.0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,5.0,2.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,1.0,2.0,1
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,1.0,2.0,1
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,5.0,2.0,0
6,6,0,3,"Moran, Mr. James",male,29.2,0,0,330877,8.4583,5.0,1.0,0
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,3.0,2.0,1
8,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,3.0,2.0,1
9,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,5.0,2.0,0


In [6]:
#elimination du colonne 
mydata = mydata.drop('Cabin', axis=1)

In [7]:
mydata.to_csv('train_svm.csv', index=False)

In [8]:
age_median = mydata['Age'].median()
mydata['Age'].fillna(age_median, inplace=True)
mydata.to_csv('train_svm.csv', index=False)

In [9]:
# Find the most frequent value in the "Embarked" column
most_frequent_embarked = mydata['Embarked'].mode()[0]
# Fill the missing values with the most frequent value
mydata['Embarked'].fillna(most_frequent_embarked, inplace=True)
# Save the modified dataset
mydata.to_csv('train_b.csv', index=False)

In [10]:
# Split the dataset into features and target variable
X = mydata.drop(columns=['Duplicate'])  # Features
y = mydata['Duplicate']  # Target variable
pd.set_option('display.max_rows', None)
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 25 entries, 0 to 24
Series name: Duplicate
Non-Null Count  Dtype
--------------  -----
25 non-null     int64
dtypes: int64(1)
memory usage: 332.0 bytes


In [11]:
# X, y = oversampler.fit_resample(X, y)

In [12]:
# Extraction des caractéristiques pertinentes
features = X[["Name","Sex","Age", "Ticket"]].copy()
features["Sex"] = features["Sex"].map({"male": 0, "female": 1})  # Encodage binaire pour la variable catégorielle "Sex"

In [13]:
# Traitement du langage naturel pour la variable "Name"
name_features = X["Name"].copy()
name_features.fillna("", inplace=True)  # Remplacer les valeurs manquantes par une chaîne vide

# Tokenisation des noms
name_features = name_features.apply(lambda x: x.split())

# Représentation TF-IDF des noms
tfidf = TfidfVectorizer()
name_encoded = tfidf.fit_transform(name_features.apply(lambda x: " ".join(x)))
name_columns = ["Name_" + feature for feature in tfidf.get_feature_names_out()]
name_features = pd.DataFrame(name_encoded.toarray(), columns=name_columns)

# Concaténer les caractéristiques avec les autres caractéristiques
features = pd.concat([features, name_features], axis=1)


In [14]:
ticket_features = X["Ticket"].copy()
ticket_features.fillna("", inplace=True)
ticket_features = ticket_features.apply(lambda x: x.split())
ticket_encoded = tfidf.transform(ticket_features.apply(lambda x: " ".join(x)))
ticket_columns = ["Ticket_" + feature for feature in tfidf.get_feature_names_out()]
ticket_features = pd.DataFrame(ticket_encoded.toarray(), columns=ticket_columns)
features = pd.concat([features, ticket_features], axis=1)
features.drop("Ticket", axis=1, inplace=True)

In [15]:
features

Unnamed: 0,Name,Sex,Age,Name_achem,Name_adele,Name_adolfina,Name_allen,Name_amanda,Name_anders,Name_andersson,Name_asplund,Name_augusta,Name_berg,Name_bonnell,Name_bradley,Name_braund,Name_briggsthayer,Name_carl,Name_charles,Name_cumings,Name_elisabeth,Name_elizabeth,Name_emelia,Name_emilia,Name_eugene,Name_fatima,Name_florence,Name_futrelle,Name_gosta,Name_harris,Name_heath,Name_heikkinen,Name_henry,Name_hewlett,Name_hulda,Name_jacques,Name_james,Name_johan,Name_johansson,Name_john,Name_johnson,Name_julius,Name_kingcome,Name_laina,Name_leonard,Name_lily,Name_marguerite,Name_maria,Name_mary,Name_masselmani,Name_master,Name_may,Name_mccarthy,Name_miss,Name_moran,Name_mr,Name_mrs,Name_nasser,Name_nicholas,Name_oscar,Name_owen,Name_palsson,Name_peel,Name_planke,Name_rice,Name_rut,Name_sandstrom,Name_saundercock,Name_selma,Name_timothy,Name_vandemoortele,Name_vander,Name_vestrom,Name_vilhelmina,Name_william,Name_williams,Ticket_achem,Ticket_adele,Ticket_adolfina,Ticket_allen,Ticket_amanda,Ticket_anders,Ticket_andersson,Ticket_asplund,Ticket_augusta,Ticket_berg,Ticket_bonnell,Ticket_bradley,Ticket_braund,Ticket_briggsthayer,Ticket_carl,Ticket_charles,Ticket_cumings,Ticket_elisabeth,Ticket_elizabeth,Ticket_emelia,Ticket_emilia,Ticket_eugene,Ticket_fatima,Ticket_florence,Ticket_futrelle,Ticket_gosta,Ticket_harris,Ticket_heath,Ticket_heikkinen,Ticket_henry,Ticket_hewlett,Ticket_hulda,Ticket_jacques,Ticket_james,Ticket_johan,Ticket_johansson,Ticket_john,Ticket_johnson,Ticket_julius,Ticket_kingcome,Ticket_laina,Ticket_leonard,Ticket_lily,Ticket_marguerite,Ticket_maria,Ticket_mary,Ticket_masselmani,Ticket_master,Ticket_may,Ticket_mccarthy,Ticket_miss,Ticket_moran,Ticket_mr,Ticket_mrs,Ticket_nasser,Ticket_nicholas,Ticket_oscar,Ticket_owen,Ticket_palsson,Ticket_peel,Ticket_planke,Ticket_rice,Ticket_rut,Ticket_sandstrom,Ticket_saundercock,Ticket_selma,Ticket_timothy,Ticket_vandemoortele,Ticket_vander,Ticket_vestrom,Ticket_vilhelmina,Ticket_william,Ticket_williams
0,"Braund, Mr. Owen Harris",0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316594,0.0,0.0,0.0,0.0,0.547652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Cumings, Mrs. John Bradley (Florence BriggsTha...",1,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436544,0.0,0.436544,0.0,0.0,0.436544,0.0,0.0,0.0,0.0,0.0,0.0,0.436544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Heikkinen, Miss. Laina",1,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.223333,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.223333,0.0,0.0,0.0,0.0,0.0,0.397937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Allen, Mr. William Henry",0,35.0,0.0,0.0,0.0,0.586703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Moran, Mr. James",0,29.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.654533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.654533,0.378381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"McCarthy, Mr. Timothy J",0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642099,0.0,0.0,0.418829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"McCarthy, Mr. Timothy J",0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642099,0.0,0.0,0.418829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Palsson, Master. Gosta Leonard",0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513973,0.0,0.0,0.0,0.0,0.0,0.455516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
features.drop("Name", axis=1, inplace=True)
# Create an instance of MinMaxScaler
min_max_scaler = MinMaxScaler()

# Perform normalization on the feature matrix X
features = min_max_scaler.fit_transform(features)


In [17]:
features

array([[0.        , 0.35714286, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.64285714, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.42857143, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.51785714, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.33928571, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.64285714, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [18]:
# Division de l'ensemble de données en ensembles d'entraînement et de validation
X_train, X_valid, y_train, y_valid = train_test_split(features, y, test_size=0.2, random_state=42)

In [19]:
# Entraînement du modèle SVM
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

In [22]:
# Charger la nouvelle base de données sur laquelle vous souhaitez appliquer le SNM
new_data = pd.read_csv("resultat.csv")  # Remplacez "new_data.csv" par le nom de votre fichier de données
# Effectuer le même prétraitement que vous avez appliqué à la première base de données
new_data = new_data.drop('Cabin', axis=1)
age_median = new_data['Age'].median()
new_data['Age'].fillna(age_median, inplace=True)
most_frequent_embarked = new_data['Embarked'].mode()[0]
new_data['Embarked'].fillna(most_frequent_embarked, inplace=True)
new_features = new_data[["Name", "Sex", "Age", "Ticket"]].copy()
new_features["Sex"] = new_features["Sex"].map({"male": 0, "female": 1})
new_name_features = new_data["Name"].copy()
new_name_features.fillna("", inplace=True)
new_name_features = new_name_features.apply(lambda x: x.split())
new_name_encoded = tfidf.transform(new_name_features.apply(lambda x: " ".join(x)))
new_name_features = pd.DataFrame(new_name_encoded.toarray(), columns=name_columns)
new_features = pd.concat([new_features, new_name_features], axis=1)
new_features.drop("Name", axis=1, inplace=True)
# 
new_ticket_features = new_data["Ticket"].copy()
new_ticket_features.fillna("", inplace=True)
new_ticket_features = new_ticket_features.apply(lambda x: x.split())
new_ticket_encoded = tfidf.transform(new_ticket_features.apply(lambda x: " ".join(x)))
new_ticket_features = pd.DataFrame(new_ticket_encoded.toarray(), columns=ticket_columns)
new_features = pd.concat([new_features, new_ticket_features], axis=1)
new_features.drop("Ticket", axis=1, inplace=True)
# 
new_features = min_max_scaler.transform(new_features)

In [23]:
new_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Duplicate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,2.0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence BriggsTha...",female,38.0,1,0,PC 17599,71.2833,0.0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,2.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,2.0,1
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,2.0,1
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,2.0,0
6,6,0,3,"Moran, Mr. James",male,29.2,0,0,330877,8.4583,1.0,0
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,2.0,1
8,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,2.0,1
9,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,2.0,0


In [24]:
# Prédiction sur l'ensemble de validation

y_pred = svm.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [25]:
#st=time.time()
similarity_labels = svm.predict(new_features)
similarity_labels

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0], dtype=int64)

In [26]:
# Appliquer le SNM en utilisant les étiquettes de similarité prédites
def snm_duplicates(data, similarity_labels, threshold):
    n = len(data)
    duplicates = []

    for i in range(n):
        for j in range(i + 1, min(i + 3, n)):
            if similarity_labels[i] == similarity_labels[j]:
                if data['Name'][i] == data['Name'][j]:

#                 similarity = pairwise_distances([data[i]], [data[j]], metric='euclidean')
#                 if similarity < threshold:
                    duplicates.append((i, j))

    return duplicates

In [27]:
st=time.time()
# import csv

new_duplicates = snm_duplicates(new_data, similarity_labels, 1)
for i, j in new_duplicates:
    print("Duplicate pair:", i, j)
    print("Data 1:", new_data.iloc[i])
    print("Data 2:", new_data.iloc[j])
    print("--------------------")
# double = pd.read_csv("train_p.txt")
et=time.time()
et-st

Duplicate pair: 3 4
Data 1: PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                  female
Age                                                    35.0
SibSp                                                     1
Parch                                                     0
Ticket                                               113803
Fare                                                   53.1
Embarked                                                2.0
Duplicate                                                 1
Name: 3, dtype: object
Data 2: PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Hea

0.01600503921508789

In [28]:
new_duplicates

[(3, 4), (7, 8), (17, 18), (21, 22)]

In [29]:
new_y_valid=new_data['Duplicate']
print(classification_report(new_y_valid, similarity_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00         8

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

