**Download data**

In [63]:
import pandas as pd
data = pd.read_csv('train_data.csv')
print(data.head())

   id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0   1         M       11.840         18.70           77.93      440.6   
1   2         B       12.760         18.84           81.87      496.6   
2   3         M       23.290         26.67          158.90     1685.0   
3   4         B       12.560         19.07           81.92      485.8   
4   5         B        9.742         15.67           61.50      289.9   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11090           0.15160         0.12180              0.05182   
1          0.09676           0.07952         0.02688              0.01781   
2          0.11410           0.20840         0.35230              0.16200   
3          0.08760           0.10380         0.10300              0.04391   
4          0.09037           0.04689         0.01103              0.01407   

   ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0  ...         16.82          

**Checking for missing values**

In [64]:
print(data.info())  # Overview of columns and types
print(data.describe())  # Statistical summary
print(data.isnull().sum())  # Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       400 non-null    int64  
 1   diagnosis                400 non-null    object 
 2   radius_mean              400 non-null    float64
 3   texture_mean             400 non-null    float64
 4   perimeter_mean           400 non-null    float64
 5   area_mean                400 non-null    float64
 6   smoothness_mean          400 non-null    float64
 7   compactness_mean         400 non-null    float64
 8   concavity_mean           400 non-null    float64
 9   concave points_mean      400 non-null    float64
 10  symmetry_mean            400 non-null    float64
 11  fractal_dimension_mean   400 non-null    float64
 12  radius_se                400 non-null    float64
 13  texture_se               400 non-null    float64
 14  perimeter_se             4

In [65]:
data = data.drop('id', axis=1)

In [66]:
print(data.head())

  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M       11.840         18.70           77.93      440.6   
1         B       12.760         18.84           81.87      496.6   
2         M       23.290         26.67          158.90     1685.0   
3         B       12.560         19.07           81.92      485.8   
4         B        9.742         15.67           61.50      289.9   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11090           0.15160         0.12180              0.05182   
1          0.09676           0.07952         0.02688              0.01781   
2          0.11410           0.20840         0.35230              0.16200   
3          0.08760           0.10380         0.10300              0.04391   
4          0.09037           0.04689         0.01103              0.01407   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2301  ...         16.82          28.12 

**Data spliting**

In [67]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Split the data
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



**Feature Scaling**

In [68]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [69]:
print(X_resampled_scaled)

[[-0.64502305 -0.15511094 -0.61801263 ... -0.55175756 -1.35835563
  -0.71509895]
 [-0.45372907 -1.40337625 -0.46863784 ... -0.5654998  -0.33626973
  -0.64192628]
 [ 0.78702492  0.06421044  0.62793521 ... -0.16563061  3.23022841
  -1.31212141]
 ...
 [ 0.3482114   0.42127804  0.38265885 ... -0.08858885 -0.23486265
  -0.12446054]
 [ 1.13692956 -0.23876394  1.13459247 ...  0.60201517 -0.53259929
  -0.95752739]
 [ 0.08402194 -1.11033384  0.16542197 ...  1.58540814  2.48134235
   0.69177894]]


**Model Trainning**

In [78]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_resampled_scaled, y_resampled)
svm_y_pred = svm_model.predict(X_test_scaled)


In [79]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy
svm_accuracy = accuracy_score(y_test, svm_y_pred)

# Convert to percentage
svm_accuracy_percentage = svm_accuracy * 100

print(f'SVM Accuracy: {svm_accuracy_percentage:.2f}%')


SVM Accuracy: 98.75%


In [80]:
from sklearn.metrics import classification_report

print(classification_report(y_test, svm_y_pred, target_names=['malignant', 'benign']))


              precision    recall  f1-score   support

   malignant       1.00      0.98      0.99        59
      benign       0.95      1.00      0.98        21

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.98        80
weighted avg       0.99      0.99      0.99        80



In [81]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(svm_model, X_resampled_scaled, y_resampled, cv=5)
print(f'Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%')


Cross-Validation Accuracy: 98.19%


**Testing**

In [83]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def predict_diagnosis(features):
    # Convert the features list to a numpy array and reshape it
    features_array = np.array(features).reshape(1, -1)
    
    # Scale the input features using the same scaler you used for training
    scaled_features = scaler.transform(features_array)
    
    # Make the prediction
    prediction = svm_model.predict(scaled_features) 

    if prediction[0] == 'M':
        return 'malignant'
    else:
        return 'benign'
    

input_features = [12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,0.06183,0.2213,1.285,1.535,17.26,0.005608,0.01646,0.01529,0.009997,0.01909,0.002133,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238]

# Get the prediction
result = predict_diagnosis(input_features)
print(f'The predicted diagnosis is: {result}')


The predicted diagnosis is: benign




In [85]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC



def predict_diagnosis(features):
    # Convert the features list to a numpy array and reshape it
    features_array = np.array(features).reshape(1, -1)
    
    # Scale the input features using the same scaler you used for training
    scaled_features = scaler.transform(features_array)
    
    # Make the prediction
    prediction = svm_model.predict(scaled_features) 

    if prediction[0] == 'M':
        return 'malignant'
    else:
        return 'benign'
    

input_features = [13.82,24.49,92.33,595.9,0.1162,0.1681,0.1357,0.06759,0.2275,0.07237,0.4751,1.528,2.974,39.05,0.00968,0.03856,0.03476,0.01616,0.02434,0.006995,16.01,32.94,106.0,788.0,0.1794,0.3966,0.3381,0.1521,0.3651,0.1183]

# Get the prediction
result = predict_diagnosis(input_features)
print(f'The predicted diagnosis is: {result}')


The predicted diagnosis is: malignant


