In [3]:
import pandas as pd
import numpy as np

cols = ['methodology', 'requirements_volatility', 
                    'requirements_clarity', 'dev_time', 'project_size', 'team_size', 
                    'prod_complexity', 'testing_intensity', 'risk_analysis', 'user_participation',
                    'team_expertise', 'dev_expertise', 'doc_needed', 'fund_avail', 'delivery_speed','task_visualisation']
        
num_cols = [6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]

df = pd.read_csv('SDLC2.csv', names = cols, usecols=num_cols, header = 0)

df.head()


Unnamed: 0,methodology,requirements_volatility,requirements_clarity,dev_time,project_size,team_size,prod_complexity,testing_intensity,risk_analysis,user_participation,team_expertise,dev_expertise,doc_needed,fund_avail,delivery_speed,task_visualisation
0,Scrum,Changing,unknown/defined later in the lifecycle,Intensive,Medium,Medium (6-15),Simple,After each cycle (Intensive testing),Low,High,High,High,Low,High,Medium,Low
1,Kanban,Changing,unknown/defined later in the lifecycle,Intensive,Medium,Large (16....),Complex,After each cycle (Intensive testing),Medium,Low,High,High,Medium,High,Medium,High
2,Scrum,Changing,unknown/defined later in the lifecycle,Intensive,Medium,Medium (6-15),Simple,After each cycle (Intensive testing),Low,High,High,High,Low,High,Medium,Low
3,Hybrid: Scrum and Waterfall,Changing,understandable/early defined,Intensive,Medium,Large (16....),Complex,After each cycle (Intensive testing),Medium,High,Medium,High,High,High,Medium,Low
4,Hybrid: Scrum and Waterfall,Changing,understandable/early defined,Intensive,Medium,Medium (6-15),Complex,After each cycle (Intensive testing),High,Medium,High,High,Medium,High,Medium,Low


In [4]:
def convert_to_vectors(df):

        df['risk_analysis'] = df['risk_analysis'].map(dict(Low=1, Medium=2,High=3))
        df['user_participation'] = df['user_participation'].map(dict(Low=1, Medium=2,High=3))
        df['team_expertise'] = df['team_expertise'].map(dict(Low=1, Medium=2,High=3))
        df['dev_expertise'] = df['dev_expertise'].map(dict(Low=1, Medium=2,High=3))
        df['doc_needed'] = df['doc_needed'].map(dict(Low=1, Medium=2,High=3))
        df['fund_avail'] = df['fund_avail'].map(dict(Low=1, Medium=2,High=3))
        df['delivery_speed'] = df['delivery_speed'].map(dict(Low=1, Medium=2,High=3))
        df['task_visualisation'] = df['task_visualisation'].map(dict(Low=1, Medium=2,High=3))

        # project_type = {'Application (everything else)': 1,'System (sits between the hardware and the application software e.g. OSs)': 2,
        #                 'Utility (performs specific tasks to keep the computer running e.g. antivirus)':3}
        requirements_volatility = {'Changing': 1,'Fixed': 2}
        requirements_clarity = {'unknown/defined later in the lifecycle': 1,'understandable/early defined': 2}
        dev_time = {'Intensive':1, 'Non-Intensive':2}
        project_size = {'Small':1 , 'Medium':2, 'Large':3}
        team_size = {'Small (1-5)':1, 'Medium (6-15)':2, 'Large (16....)':3}
        prod_complexity = {'Simple':1, 'Complex':2}
        testing_intensity = {'After each cycle (Intensive testing)':1, 'After development is done (Non-intensive testing)':2}


        # df.project_type = [project_type[item] for item in df.project_type]
        df.requirements_volatility = [requirements_volatility[item] for item in df.requirements_volatility]
        df.requirements_clarity = [requirements_clarity[item] for item in df.requirements_clarity]
        df.dev_time = [dev_time[item] for item in df.dev_time]
        df.project_size = [project_size[item] for item in df.project_size]
        df.team_size = [team_size[item] for item in df.team_size]
        df.prod_complexity = [prod_complexity[item] for item in df.prod_complexity]
        df.testing_intensity = [testing_intensity[item] for item in df.testing_intensity]

        return df

In [5]:
conv = convert_to_vectors(df)
conv

Unnamed: 0,methodology,requirements_volatility,requirements_clarity,dev_time,project_size,team_size,prod_complexity,testing_intensity,risk_analysis,user_participation,team_expertise,dev_expertise,doc_needed,fund_avail,delivery_speed,task_visualisation
0,Scrum,1,1,1,2,2,1,1,1,3,3,3,1,3,2,1
1,Kanban,1,1,1,2,3,2,1,2,1,3,3,2,3,2,3
2,Scrum,1,1,1,2,2,1,1,1,3,3,3,1,3,2,1
3,Hybrid: Scrum and Waterfall,1,2,1,2,3,2,1,2,3,2,3,3,3,2,1
4,Hybrid: Scrum and Waterfall,1,2,1,2,2,2,1,3,2,3,3,2,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,Waterfall,2,2,2,1,1,1,2,3,1,2,2,3,1,1,1
122,Waterfall,2,2,2,1,1,1,2,3,1,2,2,3,1,1,1
123,Waterfall,2,2,2,1,1,1,2,3,1,2,2,3,1,1,1
124,Waterfall,2,2,2,1,1,1,2,3,1,2,2,3,1,1,1


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X = df.drop('methodology',axis=1)
y = df[['methodology']]
k = 5
kf = KFold(n_splits=k, random_state=None)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42)
acc_score = []
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
    clf_model = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=17, min_samples_leaf=1)   
    clf_model.fit(X_train,y_train)
    y_predict = clf_model.predict(X_test)

    acc = accuracy_score(y_test,y_predict)
    acc_score.append(acc)
    
avg_acc_score = sum(acc_score)/k
 
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.5384615384615384, 0.92, 0.8, 1.0, 1.0]
Avg accuracy : 0.8516923076923077


In [7]:
def perform_D3(data):
    cl_dataset = convert_to_vectors(data)    
    X = cl_dataset.drop('methodology',axis=1)
    y = cl_dataset[['methodology']]
    k = 5
    kf = KFold(n_splits=k, random_state=None)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42)
    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
        model = DecisionTreeClassifier(criterion="entropy", random_state=42,max_depth=17, min_samples_leaf=1)  

        model.fit(X_train,y_train)
        y_predict = model.predict(X_test)

        acc = accuracy_score(y_test,y_predict)
        acc_score.append(acc)
    avg_acc_score = sum(acc_score)/k

    print_fold_accuracy = print('accuracy of each fold - {}'.format(acc_score))
    print_avg_accuracy = print('Avg accuracy : {}'.format(avg_acc_score))
    
    return y_predict

In [8]:
D3 = perform_D3(df)
D3

KeyError: 1

In [9]:
y_predict = clf_model.predict(X_test)

accuracy_score(y_test,y_predict)
y_test, y_predict
test_frame = pd.DataFrame(y_test)
pred_frame = pd.DataFrame(y_predict)
results = pd.concat([test_frame, pred_frame], axis=1)
results

Unnamed: 0,methodology,0
0,,RAD
1,,RAD
2,,Waterfall
3,,Waterfall
4,,Waterfall
5,,Waterfall
6,,Waterfall
7,,Waterfall
8,,Waterfall
9,,Waterfall


In [10]:
target = list(df['methodology'].unique())
feature_names = list(X.columns)

In [11]:
from sklearn.tree import export_text
r = export_text(clf_model, feature_names=feature_names)
print(r)

|--- doc_needed <= 1.50
|   |--- project_size <= 2.50
|   |   |--- user_participation <= 2.50
|   |   |   |--- class: RAD
|   |   |--- user_participation >  2.50
|   |   |   |--- class: Scrum
|   |--- project_size >  2.50
|   |   |--- delivery_speed <= 2.50
|   |   |   |--- class: Spiral
|   |   |--- delivery_speed >  2.50
|   |   |   |--- class: Hybrid: Scrum and Kanban
|--- doc_needed >  1.50
|   |--- risk_analysis <= 1.50
|   |   |--- user_participation <= 2.50
|   |   |   |--- class: Hybrid: Scrum and Kanban
|   |   |--- user_participation >  2.50
|   |   |   |--- class: RAD
|   |--- risk_analysis >  1.50
|   |   |--- task_visualisation <= 2.00
|   |   |   |--- fund_avail <= 2.50
|   |   |   |   |--- project_size <= 1.50
|   |   |   |   |   |--- class: Waterfall
|   |   |   |   |--- project_size >  1.50
|   |   |   |   |   |--- delivery_speed <= 2.00
|   |   |   |   |   |   |--- class: RAD
|   |   |   |   |   |--- delivery_speed >  2.00
|   |   |   |   |   |   |--- class: Hybrid: S

In [12]:
f_names = ['requirements_volatility', 
        'requirements_clarity', 'dev_time', 'project_size', 'team_size', 
        'prod_complexity', 'testing_intensity', 'risk_analysis', 'user_participation',
        'team_expertise', 'dev_expertise', 'doc_needed', 'fund_avail', 'delivery_speed', 'task_visualisation']
c_names = ['Waterfall', 'Scrum', 'Kanban', 'Hybrid: Scrum and Kanban', 'Hybrid: Scrum and Waterfall', 'Spiral', 'RAD' ]

In [13]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = "if "
        
        for p in path[:-1]:
            if rule != "if ":
                rule += " and "
            rule += str(p)
        rule += " then "
        if class_names is None:
            rule += "response: "+str(np.round(path[-1][0][0][0],3))
        else:
            classes = path[-1][0][0]
            l = np.argmax(classes)
            rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
        rule += f" | based on {path[-1][1]:,} samples"
        rules += [rule]
        
    return rules

In [15]:
rules = get_rules(clf_model, f_names, c_names)
for r in rules:
    print(r)

IndexError: list index out of range