In [1]:
import math
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
%matplotlib inline

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Directory holding the normalized OPV spreadsheets. The default is the location
# this notebook was originally written against; set the PY_CONJUGATED_DATA
# environment variable to read the same files from somewhere else without
# editing this cell.
DATA_DIR = os.environ.get('PY_CONJUGATED_DATA',
                          '/Users/wesleytatum/Desktop/py-conjugated/data')

# Column groups are declared once so the train and test frames are always
# sliced with exactly the same columns, in the same order.
DEVICE_FEATURE_COLUMNS = ['Time (min)', 'Temp (C)']
FEATURE_COLUMNS = ['Anneal_time', 'Anneal_temp', 'MajorAL_avg', 'MinorAL_avg',
                   'Ecc_avg', 'Orient_avg', 'Perim_avg']
TARGET_COLUMNS = ['PCE', 'VocL', 'Jsc', 'FF']

dev_train_df = pd.read_excel(os.path.join(DATA_DIR, 'normed_OPV_device.xlsx'))
train_df = pd.read_excel(os.path.join(DATA_DIR, 'normed_OPV_train.xlsx'))
test_df = pd.read_excel(os.path.join(DATA_DIR, 'normed_OPV_test.xlsx'))

# Device-only table: two processing columns as inputs, four device metrics as outputs.
dev_only_x = dev_train_df[DEVICE_FEATURE_COLUMNS]
dev_only_y = dev_train_df[TARGET_COLUMNS]

# Train / test tables: seven feature columns as inputs, the same four outputs.
x_train = train_df[FEATURE_COLUMNS]
y_train = train_df[TARGET_COLUMNS]
x_test = test_df[FEATURE_COLUMNS]
y_test = test_df[TARGET_COLUMNS]

After loading in the data and splitting it into our training and testing datasets, it's time for some curve fitting and hyper-parameter optimization. This notebook optimizes the Random Forest fitting on the above datasets.

First, we need to define some functions to train, optimize, and compare the Random Forest regression results for predicting the four OPV device parameters: PCE, $V_{OC}$, $J_{SC}$, and FF.

In [3]:
def round_up(n, decimals=1):
    """
    Round `n` up (toward +infinity) to `decimals` decimal places.

    Parameters
    ----------
    n : int or float
        Value to round.
    decimals : int, default 1
        Number of decimal places to keep.

    Returns
    -------
    float
        The smallest multiple of 10**-decimals that is >= `n`, up to the
        floating-point tolerance described below.

    Notes
    -----
    The scaled value is rounded to 9 decimal places before the ceiling is
    taken. Without that step, binary floating-point noise leaks into the
    result: 1.1 * 100 evaluates to 110.00000000000001, whose ceiling is 111,
    so 1.1 would be "rounded up" to 1.11. The consequence is that inputs lying
    within about 5e-10 (after scaling) above a grid point are treated as being
    exactly on it.
    """
    multiplier = 10 ** decimals
    # Snap away representation error so values already on the grid stay put.
    scaled = round(n * multiplier, 9)
    return math.ceil(scaled) / multiplier


def round_down(n, decimals=1):
    """
    Round `n` down (toward -infinity) to `decimals` decimal places.

    Parameters
    ----------
    n : int or float
        Value to round.
    decimals : int, default 1
        Number of decimal places to keep.

    Returns
    -------
    float
        The largest multiple of 10**-decimals that is <= `n`, up to the
        floating-point tolerance described below.

    Notes
    -----
    The scaled value is rounded to 9 decimal places before the floor is
    taken. Without that step, binary floating-point noise leaks into the
    result: 0.29 * 100 evaluates to 28.999999999999996, whose floor is 28,
    so 0.29 would be "rounded down" to 0.28. The consequence is that inputs
    lying within about 5e-10 (after scaling) below a grid point are treated as
    being exactly on it.
    """
    multiplier = 10 ** decimals
    # Snap away representation error so values already on the grid stay put.
    scaled = round(n * multiplier, 9)
    return math.floor(scaled) / multiplier


def plot_parity(labels, predictions):
    """
    Draw a parity plot (predictions vs. ground truth) annotated with $R^{2}$.

    Ground-truth values go on the x-axis and predictions on the y-axis. A
    black y = x reference line spans the plotted range, and both axes share
    the same limits so that perfect predictions fall on the diagonal.

    Parameters
    ----------
    labels : array-like
        Ground-truth values for a single target. Flattened to 1-D, so pass one
        target at a time rather than a multi-column table.
    predictions : array-like
        Model predictions, the same length as `labels`. Also flattened to 1-D.

    Returns
    -------
    fig : matplotlib.figure.Figure
    ax : matplotlib.axes.Axes
    """
    # Flatten to 1-D float arrays so lists, Series and (n, 1) arrays all work.
    y_true = np.asarray(labels, dtype=float).ravel()
    y_pred = np.asarray(predictions, dtype=float).ravel()

    # Shared axis limits, padded outward to one decimal place.
    upper = round_up(max(y_true.max(), y_pred.max()))
    lower = round_down(min(y_true.min(), y_pred.min()))

    r2 = r2_score(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_true, y_pred)
    # Two end points are enough for the y = x line. The previous
    # np.arange(upper, lower, 0.1) counted from high to low with a positive
    # step, which yields an empty array, so no line was ever drawn.
    ax.plot([lower, upper], [lower, upper], c='k')
    # Axes-fraction coordinates keep the label inside the frame whatever the
    # data range is.
    ax.annotate(f"$R^{2}$ = {r2:.3f}", xy=(0.2, 0.4), xycoords='axes fraction')
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)
    # scatter() puts labels on x and predictions on y; name the axes to match.
    ax.set_xlabel("Ground Truth")
    ax.set_ylabel("Predictions")

    return fig, ax


def mean_absolute_accuracy(labels, predictions):
    """
    Average the absolute relative deviation of predictions from labels.

    For each (prediction, label) pair the value |(label - prediction) / label|
    is computed, and the arithmetic mean of those values is returned. The
    result is a fraction (not multiplied by 100); despite the function name,
    0 means every prediction matched its label and larger values mean larger
    relative error.
    """
    relative_errors = [
        np.abs((truth - guess) / truth)
        for guess, truth in zip(predictions, labels)
    ]

    return sum(relative_errors) / len(relative_errors)



In [None]:
# Candidate values for the tree's max_depth. `depths` was not defined anywhere
# before this cell, so a fresh-kernel run stopped here with a NameError.
# NOTE(review): 1..20 is a placeholder range — confirm the intended grid.
depths = range(1, 21)

# Mean squared error at each depth, in the same order as `depths`.
train_errors = []
test_errors = []

for d in depths:
    # A fixed seed makes the fitted tree (and the error curves) repeatable
    # between runs.
    tree = DecisionTreeRegressor(max_depth=d, random_state=0)
    tree.fit(x_train, y_train)

    y_train_pred = tree.predict(x_train)
    y_test_pred = tree.predict(x_test)

    # y_train / y_test hold several target columns; mean_squared_error is
    # called with its defaults, so each entry is one number per depth.
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    train_errors.append(train_mse)
    test_errors.append(test_mse)