In [1]:
import pandas as pd
import math
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix, hstack, csr_matrix
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import time

In [2]:
# CSV locations of the dataset splits, listed as (train, validation, test);
# get_data treats the first entry of `paths` as the training split.
train_data_path, validation_data_path, test_data_path = (
    "DrugsComTrain.csv",
    "DrugsComVal.csv",
    "DrugsComTest.csv",
)
paths = (train_data_path, validation_data_path, test_data_path)

# NOTE(review): neither of the two settings below is read by the visible
# cells; presumably they are consumed further down the notebook - confirm.
output_folder_path = "output"
question_number = 'a'

In [3]:
def clean(review, stopwords):
    """Normalise a piece of text and drop stop words.

    The text is lower-cased, every ``&#039;`` entity is deleted, and periods,
    commas, double quotes, carriage returns, newlines and hyphens are turned
    into spaces before the text is split on whitespace.

    Args:
        review: Raw text to normalise.
        stopwords: Container of words to discard (used for membership tests).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        no token survives.
    """
    to_space = str.maketrans({ch: " " for ch in '.,"\r\n-'})
    normalised = review.lower().replace('&#039;', "").translate(to_space)
    return " ".join(tok for tok in normalised.split() if tok not in stopwords)

def get_vocab(st):
    """Fit a new ``CountVectorizer`` on ``st``.

    Args:
        st: Iterable of text documents.

    Returns:
        A 3-tuple ``(counts, vocab, vectorizer)``: the result of
        ``fit_transform``, the feature names reported by the vectorizer,
        and the fitted vectorizer itself.
    """
    fitted = CountVectorizer()
    counts = fitted.fit_transform(st)
    return counts, fitted.get_feature_names_out(), fitted

def extract_string_feature(feature, stop_words, train, vectorizer):
    """Clean a column of text and convert it to a bag-of-words matrix.

    Missing entries (float NaN) become empty strings; every other entry is
    passed through ``clean``. The cleaned values are collected in a new list,
    so the sequence supplied by the caller is not modified.

    Args:
        feature: Sequence of raw strings, with float NaN marking missing values.
        stop_words: Stop-word container forwarded to ``clean``.
        train: When true, a new vectorizer is fitted via ``get_vocab``;
            otherwise ``vectorizer`` is reused.
        vectorizer: Object exposing ``transform``; only used when ``train``
            is false.

    Returns:
        ``(matrix, vocab, vectorizer)``. ``vocab`` is ``None`` when ``train``
        is false.
    """
    # isinstance (rather than an exact type comparison) also recognises float
    # subclasses such as numpy.float64 as missing values.
    cleaned = [
        "" if isinstance(item, float) and math.isnan(item) else clean(item, stop_words)
        for item in feature
    ]

    if train:
        return get_vocab(cleaned)
    return vectorizer.transform(cleaned), None, vectorizer

def get_date(l):
    """Turn date strings such as ``"May 20, 2012"`` into a sparse matrix.

    Each string is split on whitespace into a full English month name, a day
    token whose final character is discarded before conversion, and a year.

    Args:
        l: List of date strings.

    Returns:
        A ``csr_matrix`` of shape ``(len(l), 3)`` holding, per row, the day,
        the month number (1-12) and the year as floats.
    """
    month_names = ("January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December")
    month_number = {name: pos for pos, name in enumerate(month_names, start=1)}

    rows = []
    for entry in l:
        parts = entry.split()
        month = month_number[parts[0]]
        day = int(parts[1][:-1])  # drop the trailing character after the day
        year = int(parts[2])
        rows.append((day, month, year))

    # reshape keeps the (0, 3) shape when the input list is empty
    return csr_matrix(np.array(rows, dtype=float).reshape(len(l), 3))

def get_arrays(data, train, vect, stop_words):
    """Build the sparse feature matrix and zero-based labels for one split.

    Args:
        data: Table indexable by the column names "condition", "review",
            "date", "usefulCount" and "rating".
        train: Whether new vectorizers are fitted (true) or the ones held in
            ``vect`` are reused (false).
        vect: Two-element list ``[condition_vectorizer, review_vectorizer]``;
            both slots are overwritten in place with the vectorizers used.
        stop_words: Stop-word container forwarded to the text cleaning step.

    Returns:
        ``((X, y), vect)`` where ``X`` horizontally stacks review counts,
        condition counts, the date columns and ``usefulCount``, and ``y`` is
        the rating column minus one.
    """
    condition_counts, _, vect[0] = extract_string_feature(
        list(data["condition"]), stop_words, train, vect[0])
    review_counts, _, vect[1] = extract_string_feature(
        list(data["review"]), stop_words, train, vect[1])
    date_cols = get_date(list(data["date"]))

    # usefulCount as a single sparse column with one row per date row
    useful = np.array(data['usefulCount']).reshape(date_cols.shape[0], 1)
    features = hstack([review_counts, condition_counts, date_cols, csr_matrix(useful)])

    labels = np.array(data["rating"]) - 1
    return (features, labels), vect

def get_data(paths):
    """Read the three dataset CSVs and featurise each of them.

    The first path is handled as the training split: its vectorizers are
    fitted there and reused for the second and third paths.

    Args:
        paths: Indexable holding (at least) three CSV paths, in the order
            train, validation, test.

    Returns:
        A 3-tuple of ``(X, y)`` pairs in the same order as ``paths``.
    """
    english_stop_words = set(stopwords.words('english'))
    vectorizers = [None, None]
    splits = []
    for position in range(3):
        frame = pd.read_csv(paths[position])
        split, vectorizers = get_arrays(frame, position == 0, vectorizers, english_stop_words)
        splits.append(split)

    return tuple(splits)

def _save_alpha_plot(alphas, series, ylabel, title, filename):
    """Draw step curves against ``alphas``, save the figure, then clear it."""
    plt.xlabel("Alphas")
    plt.ylabel(ylabel)
    plt.title(title)
    for values, label in series:
        plt.plot(alphas, values, marker="o", label=label, drawstyle="steps-post")
    plt.legend()
    plt.savefig(filename)
    plt.clf()


def plot_alpha_graphs(alphas, metrics, scores):
    """Save the accuracy, impurity, node-count and depth plots against alpha.

    Four figures are written under ``Plots/Dataset2`` (Accuracy, Impurity,
    Nodes, Depth). The directory is created when it does not exist yet, so
    saving no longer fails on a fresh checkout.

    Args:
        alphas: X-axis values shared by every plot.
        metrics: 3-tuple ``(impurities, nodes, depth)``, each aligned with
            ``alphas``.
        scores: 3-tuple ``(train_scores, validation_scores, test_scores)``,
            each aligned with ``alphas``.
    """
    import os

    out_dir = 'Plots/Dataset2'
    os.makedirs(out_dir, exist_ok=True)

    train_scores, validation_scores, test_scores = scores
    impurities, nodes, depth = metrics

    # The title now names all three curves that are actually drawn.
    _save_alpha_plot(
        alphas,
        [(train_scores, "Train"), (validation_scores, "Validation"), (test_scores, "Test")],
        "Accuracies",
        "Accuracy vs Alpha for Training, Validation and Test sets",
        out_dir + '/Accuracy',
    )
    _save_alpha_plot(alphas, [(impurities, "Impurities")], "Impurities",
                     "Impurity vs Alpha", out_dir + '/Impurity')
    _save_alpha_plot(alphas, [(nodes, "Nodes")], "Nodes",
                     "Nodes vs Alpha", out_dir + '/Nodes')
    _save_alpha_plot(alphas, [(depth, "Depth")], "Depth",
                     "Depth vs Alpha", out_dir + '/Depth')

def model(clf, data):
    """Fit ``clf`` on the first split and score it on every split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        data: Sequence of ``(X, y)`` pairs; the first pair is the training set.

    Returns:
        List with one ``clf.score`` result per pair in ``data``, in order.
    """
    train_features, train_labels = data[0]
    clf.fit(train_features, train_labels)
    return [clf.score(features, labels) for features, labels in data]

def part_a(data):
    """Train a default decision tree (``random_state=0``) and report accuracy.

    Args:
        data: Sequence of ``(X, y)`` pairs ordered train, validation, test.

    Returns:
        List of text lines: a header, one accuracy line (in percent, rounded
        to 5 decimals) per split, and a trailing blank line.
    """
    accuracies = model(tree.DecisionTreeClassifier(random_state = 0), data)
    labels = ('Training Accuracy is: ', 'Validation Accuracy is: ', 'Test Accuracy is: ')

    report = ["Results for Part a:\n"]
    for position, label in enumerate(labels):
        report.append(label + str(np.round(accuracies[position]*100,5)) + '\n')
    report.append("\n")
    return report

def get_part_b_params(train_data):
    """Grid-search decision-tree hyper-parameters with 10-fold CV.

    The searched estimator is seeded with ``random_state=0`` so that repeated
    runs select the same parameters and so that the search evaluates the same
    kind of tree that ``part_b`` finally trains (which also uses
    ``random_state=0``).

    Args:
        train_data: ``(X_train, y_train)`` pair.

    Returns:
        ``(max_depth, min_samples_leaf, min_samples_split)`` of the best
        parameter combination.
    """
    X_train, y_train = train_data
    param_dict = {"max_depth": range(1,5),
                  "min_samples_split": range(2,10),
                  "min_samples_leaf": range(1,5)
                  }

    grid = GridSearchCV(
        tree.DecisionTreeClassifier(random_state = 0),
        param_grid = param_dict,
        cv = 10,
        verbose = 1,
        n_jobs = -1,
    )
    grid.fit(X_train, y_train)
    best = grid.best_params_
    return best['max_depth'], best['min_samples_leaf'], best['min_samples_split']

def part_b(data):
    """Train a decision tree with grid-searched parameters and report accuracy.

    Args:
        data: Sequence of ``(X, y)`` pairs ordered train, validation, test.

    Returns:
        List of text lines: a header, one accuracy line (in percent, rounded
        to 5 decimals) per split, and a trailing blank line.
    """
    best_depth, best_leaf, best_split = get_part_b_params(data[0])
    tuned_tree = tree.DecisionTreeClassifier(
        random_state = 0,
        max_depth = best_depth,
        min_samples_leaf = best_leaf,
        min_samples_split = best_split,
    )
    accuracies = model(tuned_tree, data)

    labels = ('Training Accuracy is: ', 'Validation Accuracy is: ', 'Test Accuracy is: ')
    report = ["Results for Part b:\n"]
    for position, label in enumerate(labels):
        report.append(label + str(np.round(accuracies[position]*100,5)) + '\n')
    report.append("\n")
    return report

def get_scores(data, clfs):
    """Score every classifier on the train, validation and test splits.

    Args:
        data: 3-tuple of ``(X, y)`` pairs ordered train, validation, test.
        clfs: Fitted estimators exposing ``score(X, y)``.

    Returns:
        ``(train_scores, validation_scores, test_scores)``; each is a list
        with one score per classifier, in the order of ``clfs``.
    """
    (train_X, train_y), (validation_X, validation_y), (test_X, test_y) = data

    def score_all(features, labels):
        return [clf.score(features, labels) for clf in clfs]

    return (
        score_all(train_X, train_y),
        score_all(validation_X, validation_y),
        score_all(test_X, test_y),
    )

def get_part_c_params(train_data):
    """Fit one decision tree per cost-complexity pruning alpha.

    The pruning path is computed on the training data, a tree is fitted for
    every alpha on that path, and the final entry (largest index) of every
    returned sequence is then dropped.

    Args:
        train_data: ``(X_train, y_train)`` pair.

    Returns:
        ``(ccp_alphas, clfs, metrics)`` where ``metrics`` is
        ``(impurities, nodes, depth)``; all sequences are aligned by index.
    """
    X_train, y_train = train_data
    pruning_path = tree.DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X_train, y_train)

    fitted_trees = []
    for alpha in pruning_path.ccp_alphas:
        pruned = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=alpha)
        pruned.fit(X_train, y_train)
        fitted_trees.append(pruned)

    node_counts = [est.tree_.node_count for est in fitted_trees]
    depths = [est.tree_.max_depth for est in fitted_trees]

    # Discard the last alpha and everything aligned with it.
    metrics = (pruning_path.impurities[:-1], node_counts[:-1], depths[:-1])
    return pruning_path.ccp_alphas[:-1], fitted_trees[:-1], metrics

def part_c(data):
    """Pick the pruning alpha with the best validation accuracy and report it.

    Also saves the alpha plots via ``plot_alpha_graphs``.

    Args:
        data: 3-tuple of ``(X, y)`` pairs ordered train, validation, test.

    Returns:
        List of text lines: a header, one accuracy line (in percent, rounded
        to 5 decimals) per split for the selected tree, the selected alpha
        (rounded to 5 decimals), and a trailing blank line.
    """
    ccp_alphas, clfs, metrics = get_part_c_params(data[0])
    scores = get_scores(data, clfs)
    plot_alpha_graphs(ccp_alphas, metrics, scores)

    # Index of the tree with the highest validation accuracy.
    best = np.argmax(np.array(scores[1]))

    labels = ('Training Accuracy is: ', 'Validation Accuracy is: ', 'Test Accuracy is: ')
    report = ["Results for Part c:\n"]
    for position, label in enumerate(labels):
        report.append(label + str(np.round(scores[position][best]*100,5)) + '\n')
    report.append('Best Alpha = ' + str(np.round(ccp_alphas[best],5)) + '\n')
    report.append("\n")
    return report

def get_part_d_params(X,y):
    """Grid-search random-forest hyper-parameters with 10-fold CV.

    The forest is fixed at ``max_depth=2`` and ``random_state=0``; the search
    covers ``n_estimators``, ``max_features`` and ``min_samples_split``.

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        ``(n_estimators, min_samples_split, max_features)`` of the best
        parameter combination.
    """
    search_space = {
        "n_estimators": range(50,450, 50),
        "max_features": ["sqrt", "log2", None],
        "min_samples_split": range(2,10,2),
    }
    search = GridSearchCV(
        RandomForestClassifier(max_depth=2, random_state=0),
        param_grid = search_space,
        cv = 10,
        verbose = 1,
        n_jobs = -1,
    )
    search.fit(X,y)
    best = search.best_params_
    return best["n_estimators"], best['min_samples_split'], best['max_features']

def part_d(data):
    """Train a random forest with grid-searched parameters and report accuracy.

    Args:
        data: Sequence of ``(X, y)`` pairs ordered train, validation, test.

    Returns:
        List of text lines: a header, one accuracy line (in percent, rounded
        to 5 decimals) per split, the out-of-bag accuracy, and a trailing
        blank line.
    """
    X_train,y_train = data[0]
    n_estimators, min_samples_split, max_features = get_part_d_params(X_train, y_train)
    forest = RandomForestClassifier(
        max_depth=2,
        random_state=0,
        n_estimators = n_estimators,
        min_samples_split = min_samples_split,
        max_features = max_features,
        oob_score = True,
    )
    forest.fit(X_train, y_train)
    accuracies = [forest.score(features, labels) for features, labels in data]

    labels = ('Training Accuracy is: ', 'Validation Accuracy is: ', 'Test Accuracy is: ')
    report = ["Results for Part d:\n"]
    for position, label in enumerate(labels):
        report.append(label + str(np.round(accuracies[position]*100,5)) + '\n')
    report.append('Out of bag Accuracy: ' + str(np.round(forest.oob_score_*100,5)) + '\n')
    report.append("\n")
    return report

def best_part_e_params(X,y):
    """Grid-search XGBoost hyper-parameters with 10-fold CV.

    Candidates are ranked by accuracy, the same metric ``part_e`` reports.
    The previous ``'roc_auc'`` scorer is only defined for binary targets.
    NOTE(review): labels come from ``rating - 1`` in ``get_arrays`` and are
    presumably multi-class - confirm; accuracy is valid in either case.

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        ``(n_estimators, subsample, max_depth)`` of the best parameter
        combination.
    """
    # Estimator settings are kept exactly as before; only the scorer changed.
    estimator = XGBClassifier(objective= 'binary:logistic', nthread=4,seed=42)
    param_dict = {"n_estimators": range(50,450,50),
                  "subsample": [0.4, 0.5, 0.6, 0.7, 0.8],
                  "max_depth": range(40,70,10)
                  }

    grid = GridSearchCV(estimator=estimator, param_grid=param_dict, scoring = 'accuracy', n_jobs = 10, cv = 10, verbose=True)
    grid.fit(X,y)
    best = grid.best_params_
    return best["n_estimators"], best["subsample"], best["max_depth"]

def part_e(data):
    """Train an XGBoost classifier with grid-searched parameters and report accuracy.

    The report now carries the same "Results for Part ..." header and
    trailing blank line that ``part_a`` through ``part_d`` emit.

    Args:
        data: Sequence of ``(X, y)`` pairs ordered train, validation, test.

    Returns:
        List of text lines: a header, one accuracy line (in percent, rounded
        to 5 decimals) per split, and a trailing blank line.
    """
    X_train, y_train = data[0]
    n_estimators, subsample, max_depth = best_part_e_params(X_train, y_train)
    clf = XGBClassifier(objective= 'binary:logistic', nthread=4,seed=42, n_estimators = n_estimators, subsample = subsample, max_depth = max_depth)
    clf.fit(X_train, y_train)
    acc = [clf.score(X, y) for X, y in data]

    lines = ['Training Accuracy is: ', 'Validation Accuracy is: ', 'Test Accuracy is: ']
    lines = [lines[i] + str(np.round(acc[i]*100,5)) + '\n' for i in range(len(lines))]
    lines = ["Results for Part e:\n"] + lines + ["\n"]
    return lines

In [4]:
# Load and featurise the train/validation/test splits listed in `paths`.
data = get_data(paths)
# The timer starts after loading, so the duration printed below covers
# part_a only, not the data preparation.
begin = time.time()
result = part_a(data)
print(result)
# Elapsed wall-clock seconds for part_a.
print(time.time()-begin)

['Results for Part a:\n', 'Training Accuracy is: 100.0\n', 'Validation Accuracy is: 58.10618\n', 'Test Accuracy is: 57.80233\n', '\n']
247.66923117637634
