In [1]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
data = pd.read_csv('smoking1.csv')

# Convert 'smoke' column to Boolean
data['smoke'] = data['smoke'].map({'Yes': True, 'No': False})

# Handle missing values
data['amt_weekends'].fillna(0, inplace=True)
data['amt_weekdays'].fillna(0, inplace=True)
data.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
0,Male,38,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,False,0.0,0.0,
1,Female,42,Single,No Qualification,British,White,"Under 2,600",The North,True,12.0,12.0,Packets
2,Male,40,Married,Degree,English,White,"28,600 to 36,400",The North,False,0.0,0.0,
3,Female,40,Married,Degree,English,White,"10,400 to 15,600",The North,False,0.0,0.0,
4,Female,39,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,False,0.0,0.0,


In [2]:

# Calculate the total weekly smoking amount
data['total_weekly_smoking'] = data['amt_weekends'] + data['amt_weekdays']

# Define a function to categorize smoking intensity
def categorize_smoking_intensity(row):
    if row['total_weekly_smoking'] == 0:
        return 'Non-Smoker'
    elif 1 <= row['total_weekly_smoking'] <= 10:
        return 'Light Smoker'
    elif 11 <= row['total_weekly_smoking'] <= 20:
        return 'Moderate Smoker'
    else:
        return 'Heavy Smoker'

# Apply the function to create the new 'smoking_intensity' column
data['smoking_intensity'] = data.apply(categorize_smoking_intensity, axis=1)
# data_original stored for testing for compare result
# data_original = data
data.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type,total_weekly_smoking,smoking_intensity
0,Male,38,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,False,0.0,0.0,,0.0,Non-Smoker
1,Female,42,Single,No Qualification,British,White,"Under 2,600",The North,True,12.0,12.0,Packets,24.0,Heavy Smoker
2,Male,40,Married,Degree,English,White,"28,600 to 36,400",The North,False,0.0,0.0,,0.0,Non-Smoker
3,Female,40,Married,Degree,English,White,"10,400 to 15,600",The North,False,0.0,0.0,,0.0,Non-Smoker
4,Female,39,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,False,0.0,0.0,,0.0,Non-Smoker


In [3]:
# Filtering tags below for to better performance. 
data = data.drop(columns=["age", "amt_weekends", "amt_weekdays","type","total_weekly_smoking"])

In [4]:
# One-hot encoding using pd.get_dummies for simplicity
# Including 'smoking_intensity' and other categorical variables that need to be one-hot encoded
#data_encoded = pd.get_dummies(data.drop(columns=['total_weekly_smoking']), drop_first=True)
data_encoded = pd.get_dummies(data, drop_first=True)

# data_original_encoded stored for testing for compare result
# data_original_encoded = pd.get_dummies(data_original, drop_first=True)

# Apply the Apriori algorithm on the encoded DataFrame
frequent_itemsets = apriori(data_encoded.astype(bool), min_support=0.2, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Display whole content of row
pd.set_option('display.max_colwidth', None)

# Display the rules sorted by confidence
#print(rules[['antecedents', 'consequents', 'support', 'confidence']].sort_values(by='support', ascending=False)[:10])
print(rules[['antecedents', 'consequents', 'support', 'confidence']].sort_values(by='confidence',ascending=False)[:10])
print(len(rules))

                                                               antecedents  \
54                           (nationality_English, marital_status_Married)   
79                     (nationality_English, smoking_intensity_Non-Smoker)   
22                                                   (nationality_English)   
42                                      (gender_Male, nationality_English)   
30                                                      (region_The North)   
73  (highest_qualification_No Qualification, smoking_intensity_Non-Smoker)   
18                                (highest_qualification_No Qualification)   
0                                                                  (smoke)   
27                                          (gross_income_5,200 to 10,400)   
33                                          (smoking_intensity_Non-Smoker)   

          consequents   support  confidence  
54  (ethnicity_White)  0.231224    0.975062  
79  (ethnicity_White)  0.363690    0.974643  
22 

In [5]:
# Filter for rules where the consequent is exactly 'smoke' or 'smoke_True'
# Adjust the string based on how smoking is represented in your encoded DataFrame
rules_with_smoke_as_consequent = rules[rules['consequents'] == {'smoke'}]  

# Display the filtered rules
print(rules_with_smoke_as_consequent[['antecedents', 'consequents', 'support', 'confidence']].sort_values(by='confidence', ascending=False))
print('total=',len(rules_with_smoke_as_consequent))

         antecedents consequents   support  confidence
1  (ethnicity_White)     (smoke)  0.230633        0.25
total= 1


In [6]:
from collections import defaultdict

# Initialize a dictionary to count occurrences of each item in antecedents
item_counts = defaultdict(int)

# Iterate over each row in the rules DataFrame
for index, row in rules.iterrows():
    # Update the count for each item found in the antecedents of this rule
    for item in row['antecedents']:
        item_counts[item] += 1

# Convert the dictionary to a list of (item, count) tuples and sort it by count in descending order
sorted_item_counts = sorted(item_counts.items(), key=lambda x: x[1], reverse=True)

# Display the most frequently appearing items in antecedents
print("Most frequently appearing items in antecedents:")
for item, count in sorted_item_counts[:10]:  # Adjust the slice as needed to display more or fewer items
    print(f"{item}: {count}")

Most frequently appearing items in antecedents:
ethnicity_White: 31
smoking_intensity_Non-Smoker: 21
marital_status_Married: 16
nationality_English: 16
gender_Male: 13
highest_qualification_No Qualification: 5
region_Midlands & East Anglia: 2
smoke: 1
marital_status_Single: 1
gross_income_5,200 to 10,400: 1


In [7]:
non_binary_numeric_columns = []

# Iterate through each column in the DataFrame
for column in data_encoded.columns:
    # Check if the column is of a numeric data type
    if pd.api.types.is_numeric_dtype(data_encoded[column]):
        # Count the number of unique values in the column
        unique_values = data_encoded[column].nunique()
        # If the number of unique values is more than 2, it's not a binary column
        if unique_values > 2:
            non_binary_numeric_columns.append(column)

print("Non-binary numeric columns:", non_binary_numeric_columns)


# Exclude any non-binary numeric columns explicitly or transform them before this step
data_for_apriori = data_encoded.drop(columns=non_binary_numeric_columns)  # Adjust column names as necessary

# Convert the DataFrame to boolean type if not already
data_for_apriori = data_for_apriori.astype(bool)

frequent_itemsets = apriori(data_for_apriori, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

print(rules[['antecedents', 'consequents', 'support', 'confidence']].sort_values(by='confidence', ascending=False)[:10])

Non-binary numeric columns: []
                                                                                                     antecedents  \
135                                                                                            (region_Scotland)   
19                                                                           (smoking_intensity_Moderate Smoker)   
618                                                                      (region_Scotland, nationality_Scottish)   
659                                                              (region_Scotland, smoking_intensity_Non-Smoker)   
122                                                                                       (nationality_Scottish)   
1269  (region_Midlands & East Anglia, nationality_English, marital_status_Married, smoking_intensity_Non-Smoker)   
204                                                         (smoking_intensity_Moderate Smoker, ethnicity_White)   
624                                      

In [8]:
# Supervised Learning
# Method: classification
# Model: Decision Tree{classification tree}
# Algorithm:CART 
# CART:produces only binary trees, meaning trees where split nodes always have exactly two children (i.e., questions only have yes/no answers). 
#      However, ID3 can produce decision trees with nodes that have more than two children, since each node has as many
#      splits/branches as there are categories. (Source: https://learning.oreilly.com/library/view/hands-on-machine-learning/9781492032632/)
#      Moreover, CART uses Gini impurity and cross-entropy for split criterion in classification, while ID3 uses information gain. 
#      (Source: Machine Learning university course).
# Step 1 : Data preparation 
# Based on Unsupervised Apriori analysis, select the features identified as relevant, such as 'ethnicity','nationality'
from sklearn.model_selection import train_test_split
X = data_encoded.drop('smoke', axis=1)
y = data_encoded['smoke']  
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
# Model Training
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Train the model
dt_classifier.fit(X_train, y_train)

In [10]:
# Model Evaluation
from sklearn.metrics import accuracy_score, classification_report

# Predictions
y_pred_train = dt_classifier.predict(X_train)
y_pred_test = dt_classifier.predict(X_test)

# Evaluation
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Testing Set):\n", classification_report(y_test, y_pred_test))


Training Accuracy: 1.0
Testing Accuracy: 0.9970501474926253

Classification Report (Testing Set):
               precision    recall  f1-score   support

       False       1.00      1.00      1.00       255
        True       0.99      1.00      0.99        84

    accuracy                           1.00       339
   macro avg       0.99      1.00      1.00       339
weighted avg       1.00      1.00      1.00       339



In [11]:
feature_importances = pd.DataFrame(dt_classifier.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances.head())  # Adjust as needed to view more features


                                        importance
smoking_intensity_Non-Smoker              0.989564
gross_income_Refused                      0.003514
region_Midlands & East Anglia             0.002515
highest_qualification_No Qualification    0.001620
marital_status_Married                    0.001395
