In [None]:
import pandas as pd
import math
from sklearn import neighbors, datasets
from numpy.random import permutation
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Names assigned to the 16 CSV columns that are loaded: the `methodology`
# label followed by the project-attribute columns.
cols = [
    'methodology',
    'project_type',
    'requirements_volatility',
    'requirements_clarity',
    'dev_time',
    'project_size',
    'team_size',
    'prod_complexity',
    'testing_intensity',
    'risk_analysis',
    'user_participation',
    'team_expertise',
    'dev_expertise',
    'doc_needed',
    'fund_avail',
    'delivery_speed',
]

# Zero-based positions (6 through 21) of the CSV columns to read.
num_cols = list(range(6, 22))

# header=0 consumes the file's own header row so that `cols` replaces its names.
df = pd.read_csv('SDLC.csv', header=0, usecols=num_cols, names=cols)

df.head()

In [None]:
# Encode the seven Low/Medium/High rating columns as the integers 0/1/2.
# Series.map leaves NaN wherever a cell holds a value that is not one of
# the three keys below.
for rating_col in (
    'risk_analysis',
    'user_participation',
    'team_expertise',
    'dev_expertise',
    'doc_needed',
    'fund_avail',
    'delivery_speed',
):
    df[rating_col] = df[rating_col].map({'Low': 0, 'Medium': 1, 'High': 2})

df.head()

In [None]:
# Lookup tables translating each categorical answer (the exact text found in
# the CSV) into an integer code. Each table is named after the column it encodes.
project_type = {
    'Application (everything else)': 0,
    'System (sits between the hardware and the application software e.g. OSs)': 1,
    'Utility (performs specific tasks to keep the computer running e.g. antivirus)': 2,
}
requirements_volatility = {'Changing': 0, 'Fixed': 1}
requirements_clarity = {
    'unknown/defined later in the lifecycle': 0,
    'understandable/early defined': 1,
}
dev_time = {'Intensive': 0, 'Non-Intensive': 1}
project_size = {'Small': 0, 'Medium': 1, 'Large': 2}
team_size = {'Small (1-5)': 0, 'Medium (6-15)': 1, 'Large (16....)': 2}
prod_complexity = {'Simple': 0, 'Complex': 1}
testing_intensity = {
    'After each cycle (Intensive testing)': 0,
    'After development is done (Non-intensive testing)': 1,
}

# Replace every column's text values with their codes. Plain dict indexing is
# used on purpose: a value missing from its table raises KeyError instead of
# being silently turned into NaN.
for column, codes in (
    ('project_type', project_type),
    ('requirements_volatility', requirements_volatility),
    ('requirements_clarity', requirements_clarity),
    ('dev_time', dev_time),
    ('project_size', project_size),
    ('team_size', team_size),
    ('prod_complexity', prod_complexity),
    ('testing_intensity', testing_intensity),
):
    df[column] = [codes[value] for value in df[column]]

df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every encoded column except the label; the label is the
# methodology column (kept as a one-column frame, as later cells expect).
X = df.drop('methodology', axis=1)
y = df[['methodology']]

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a support-vector classifier with default hyper-parameters.
# `y_train` is a single-column frame, so it is flattened to the 1-D label
# vector the estimator expects (avoids sklearn's DataConversionWarning).
clf = svm.SVC()
clf.fit(X_train, y_train.values.ravel())

# Flat Python lists of predicted and true labels, aligned row by row.
# Flattening matters: `y_test.values.tolist()` alone yields one-element lists,
# which never compare equal to a predicted label.
predicted_output = clf.predict(X_test).tolist()
test_output_data = y_test.values.ravel().tolist()

# One entry per test row: 0 when the prediction matches the true methodology,
# 1 when it does not.
error_list = [
    int(predicted != actual)
    for predicted, actual in zip(predicted_output, test_output_data)
]

predicted_output

In [None]:
# Sweep the number of neighbours for a uniform-weight k-NN classifier and plot
# the hold-out accuracy obtained for each k.

# Labels are flattened here, inside the cell, so the sweep does not depend on a
# `test_output_data` variable left behind by an earlier cell. Flattening also
# turns the one-column label frames into plain 1-D sequences, which is what
# both `fit` and the element-wise comparison below require.
train_labels = y_train.values.ravel()
test_output_data = y_test.values.ravel().tolist()

# k can never exceed the number of training samples (sklearn raises
# ValueError otherwise), so the sweep stops at whichever limit comes first.
max_neighbours = min(100, len(X_train))

neighbour_list = []
accuracy_percent = []
for neighbours in range(1, max_neighbours + 1, 5):
    clf = neighbors.KNeighborsClassifier(neighbours, weights='uniform')
    clf.fit(X_train, train_labels)
    predicted_output = clf.predict(X_test).tolist()

    # 0 for a correct prediction, 1 for a wrong one.
    error_list = [
        int(predicted != actual)
        for predicted, actual in zip(predicted_output, test_output_data)
    ]

    neighbour_list.append(neighbours)
    accuracy_percent.append(100 - ((sum(error_list) / float(len(error_list))) * 100))

neighbour_list = np.array(neighbour_list)
accuracy_percent = np.array(accuracy_percent)
plt.plot(neighbour_list, accuracy_percent)
plt.xlabel('Number of nearest neighbors')
plt.ylabel('Percent of accuracy')
plt.title('Varation of accuracy with nearest neighbours')
plt.grid(True)
plt.savefig("knn1.png")
plt.show()

# Predictions produced by the last (largest) k that was evaluated.
predicted_output

In [None]:
def Random_Forest_classifier(train_input_data, train_output_data, test_input_data,
                             test_output_data, random_state=None):
    """Plot random-forest accuracy against forest size and return the last predictions.

    A forest is trained for every tree count in ``range(10, 200, 10)``, scored
    on the test data, and the resulting accuracy curve is drawn, saved to
    ``rf1.png`` and shown.

    Parameters
    ----------
    train_input_data : feature matrix passed to ``RandomForestClassifier.fit``.
    train_output_data : training labels; a 1-D sequence or a single-column
        frame/array (it is flattened before use).
    test_input_data : feature matrix to predict on.
    test_output_data : true labels for ``test_input_data``; flattened the same
        way as ``train_output_data``.
    random_state : optional seed forwarded to every forest. The default
        ``None`` keeps the unseeded behaviour.

    Returns
    -------
    list
        Predicted labels from the largest forest evaluated (190 trees).
    """
    # Flatten once, up front: np.ravel accepts lists, arrays, Series and
    # single-column frames alike, so no type checks are needed in the loop.
    train_labels = np.ravel(train_output_data)
    actual_labels = np.ravel(test_output_data).tolist()

    tree_list = []
    accuracy_percent = []
    predicted_output = []
    for trees in range(10, 200, 10):
        clf = RandomForestClassifier(n_estimators=trees, random_state=random_state)
        clf.fit(train_input_data, train_labels)
        predicted_output = clf.predict(test_input_data).tolist()

        # 0 for a correct prediction, 1 for a wrong one.
        error_list = [
            int(predicted != actual)
            for predicted, actual in zip(predicted_output, actual_labels)
        ]

        tree_list.append(trees)
        accuracy_percent.append(100 - ((sum(error_list) / float(len(error_list))) * 100))

    # Plot only after the whole sweep has finished, so the curve covers every
    # forest size instead of stopping after the first one.
    plt.plot(np.array(tree_list), np.array(accuracy_percent))
    plt.xlabel('Number of trees')
    plt.ylabel('Percent of accuracy')
    plt.title('Varation of accuracy with trees')
    plt.grid(True)
    plt.savefig("rf1.png")
    plt.show()
    return predicted_output

NLP

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Recommend the methodologies whose keyword sets are most similar to PMM's.

# NOTE(review): the frame loaded from 'SDLC.csv' in the first cell is given only
# the 16 column names in `cols`, and neither 'PMMname' nor 'characteristics' is
# among them. These lookups need `df` to be a frame that has both columns --
# confirm the intended data source.
PMMname = df['PMMname']
characteristics = df['characteristics']
PMM = 'Kanban'

# NOTE: `df` is rebound here; the encoded SDLC frame is no longer reachable
# under that name once this cell has run.
df = pd.DataFrame({'title': PMMname, 'characteristics': characteristics})
df = df[['title', 'characteristics']]


def _extract_key_words(texts):
    """Return the sorted vocabulary TfidfVectorizer finds in `texts`."""
    vectorizer = TfidfVectorizer()
    vectorizer.fit(texts)
    # `vocabulary_` exists in every scikit-learn release, unlike
    # `get_feature_names()`, which was removed in 1.2.
    return sorted(vectorizer.vocabulary_)


# Keyword list for each row, built from its title and characteristics text.
# The column is assigned in a single step: writing to the row objects yielded
# by `iterrows()` is not guaranteed to modify the frame.
df['Key_words'] = pd.Series(
    [
        _extract_key_words([title, text])
        for title, text in zip(df['title'], df['characteristics'])
    ],
    index=df.index,
)

# instantiating and generating the count matrix (one row per methodology)
count = CountVectorizer()
count_matrix = count.fit_transform(df['Key_words'].map(' '.join))

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Row POSITION of the methodology that matches the title. `cosine_sim` is a
# plain array, so it must be addressed by position rather than by index label.
matching_positions = [pos for pos, title in enumerate(df['title']) if title == PMM]
if not matching_positions:
    raise IndexError("no row with title %r found" % (PMM,))
idx = matching_positions[0]

# Series of similarity scores in descending order; a stable sort keeps tied
# rows in their original order so the result is deterministic.
score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False, kind='mergesort')

# Positions of the 3 most similar methodologies. The query row is dropped
# explicitly instead of assuming it is the first entry after sorting.
top_3_indexes = list(score_series.drop(idx).iloc[:3].index)

# Titles of the recommendations; any duplicate row that carries the queried
# title itself is filtered out so it is never recommended back to the user.
recommended_activities = [df['title'].iloc[i] for i in top_3_indexes]
recommended_activities = [title for title in recommended_activities if title != PMM]
# append recommendations with comma
