In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import ease
import unittest
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../Arranged_Data/final_weater.csv')[['State', 'TotalMonthlyPrecip', 'TempSummer','TempWinter', 'Avgwindspeed']]
df = df[df.State != 'DC']
train,test=train_test_split(df,test_size=0.2)
print(train.head())
print(test.head())

    State  TotalMonthlyPrecip  TempSummer  TempWinter  Avgwindspeed
592    ND            1.452679   70.516667    4.042857     10.203509
500    MN            2.262228   67.575139   15.602083      8.067204
468    MI            1.943412   72.859259   27.922222      8.733861
273    ID            1.009545   70.900000   30.025000      7.152174
868    SD            1.205357   64.131250   18.618750     10.639894
    State  TotalMonthlyPrecip  TempSummer  TempWinter  Avgwindspeed
899    TN            4.885984   75.754074   42.853333      5.230769
731    NY            3.028965   69.900748   29.807527      7.442319
342    KS            1.766017   81.068182   34.290909     10.243846
542    MS            4.655549   79.462607   50.030983      5.554491
474    MI            2.633300   69.240741   20.474074      8.018966


In [3]:
prec = test.iloc[:,1]

In [4]:
# The function involves four input parameters then returns a result of vote
def rf(prec, ts, tw, ws):
    """
    This function is RandomForest classifier to intake user input temperature,
    precipitation, and windespeed to classify the possible states the user
    will be in, and returns a dictionary with states as the keys and
    the frequency or count as the values of each key.
       input = user input values, integer or float based.
       vote  = dictionary based output that contains the RF classified states,
               and each states frequency.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    import itertools as it
    import warnings
    
    # read arranged weatehr data, drop elements contained in state DC, and remove warnings
    train = pd.read_csv('../Arranged_Data/final_weater.csv')[[
            'State', 'TotalMonthlyPrecip', 'TempSummer',
            'TempWinter', 'Avgwindspeed']]
    train = train[train.State != 'DC']
    warnings.filterwarnings('ignore')
    
    # set precipitation, summer temperature, winter temperature, and windspeed as input parameters
    # set total number of trees equal to 500
    # creat a list for saving prediction results and a dictonary for vote outcome
    input_ = [prec, ts, tw, ws]
    tree_num = 5000
    pred_list = []
    vote = {}
    rf = RandomForestClassifier(n_estimators = tree_num)
    rf.fit(train.iloc[:,1:5], train.State)

    for i in range(tree_num):
        dt = rf.estimators_[i]
        dt.fit(train.iloc[:,1:5], train.State)
        pred = dt.predict(input_)
        pred_list.append(pred[0])

    pred_key = []
    pred_key_count = []
    for key, group in it.groupby(sorted(pred_list)):
        pred_key.append(key)
        pred_key_count.append(len(list(group)))
    
    # the dictionary of vote contains states as keys and weight percent as values
    for i in range(len(pred_key)):
        vote[pred_key[i]] = pred_key_count[i]/tree_num
    return vote

In [5]:
# The function returns the average capacity of resources by multiple their average plant capacity with vote results
def avg_capacity(vote):
    """
    creat a null list
    set original resources sum euqal to zero
    for state in vote keys
        average capacity of resource = average plant capacity of particular resource in specific state * vote results specific state
    return average capacity
    """
    average_plant_capacity = pd.read_csv('../Arranged_Data/average_plant_capacity.csv')
    avg_cap_list = []
    coal_sum = 0
    ng_sum = 0
    petro_sum = 0
    hydro_sum = 0
    solar_sum = 0
    wind_sum =0
    for i in vote.keys():
        coal_sum += int(average_plant_capacity.Coal[average_plant_capacity.State == i]) * vote[i]
        ng_sum += int(average_plant_capacity.NG[average_plant_capacity.State == i]) * vote[i]
        petro_sum += int(average_plant_capacity.Petro[average_plant_capacity.State == i]) * vote[i]
        hydro_sum += int(average_plant_capacity.Hydro[average_plant_capacity.State == i]) * vote[i]
        solar_sum += int(average_plant_capacity.Solar[average_plant_capacity.State == i]) * vote[i]
        wind_sum += int(average_plant_capacity.Wind[average_plant_capacity.State == i]) * vote[i]
    return([coal_sum, ng_sum, petro_sum, hydro_sum, solar_sum, wind_sum])

In [6]:
# write up a function to select possible type of resources
def possible_type(avg_cap_list):
    """
    creat a empty list to store results of possible type
    for some type in all types
        calculat p value and set up confidence level as 95%
            if p less than alpha, pass
            else, append possible type to empty list
    return possible type 
    """
    cap_pop = pd.read_csv('../Arranged_Data/average_plant_capacity.csv')
    e_type = ['Coal', 'NG', 'Petro', 'Hydro', 'Solar', 'Wind']
    possible_type_list = []
    for i in range(len(e_type)):
        p_value = stats.ttest_1samp(cap_pop[cap_pop[e_type[i]] != 0][e_type[i]], avg_cap_list[i])[1]
        alpha = 0.05  # confidence level
        if avg_cap_list[i] < cap_pop[cap_pop[e_type[i]] !=0][e_type[i]].mean():
            if p_value < alpha:
                pass
            else:
                p_value = -(1 - p_value)
                possible_type_list.append([p_value, avg_cap_list[i], e_type[i]])
        else:
            p_value = (1 - p_value)
            possible_type_list.append([p_value, avg_cap_list[i], e_type[i]])
    return possible_type_list

In [7]:
possible_type([3427272.05, 601160.0700000001, 132529.37000000002, 159063.21, 2253.23, 149414.79999999996])

[[1.0, 3427272.05, 'Coal'],
 [0.9999957284484875, 601160.0700000001, 'NG'],
 [0.99999999999991029, 132529.37000000002, 'Petro'],
 [0.71925414999001624, 159063.21, 'Hydro'],
 [-0.94810332639059414, 2253.23, 'Solar'],
 [0.99999976119956313, 149414.79999999996, 'Wind']]

In [8]:
def clean_or_conv(possible_type_list):
    """
    initialize a null list to store conventional resource
    initialize a null lsit to store clean resource
    for some type in all types
        if type equals coal, natural gas, or petroleum, append it to conventional lsit
        elseif type equals hydro, soalr, or wind, append it to clean list
    retun conventional list, clean list
    """
    clean_list = []
    conventional_list = []
    for i in possible_type_list:
        if i[2] == 'Coal' or i[2] == 'NG' or i[2] == 'Petro':
            conventional_list.append(i)
        elif i[2] == 'Hydro' or i[2] == 'Solar' or i[2] == 'Wind':
            clean_list.append(i)
    return conventional_list, clean_list

In [21]:
conv, clean = clean_or_conv([[0.95810457341493549, 3427272.05, 'Coal'],
 [-0.8923811497455455, 601160.0700000001, 'NG'],
 [0.99926594657247059, 132529.37000000002, 'Petro'],
 [-0.90421357886564524, 159063.21, 'Hydro'],
 [-0.83811357367028649, 149414.79999999996, 'Wind']])
conv

[[0.9581045734149355, 3427272.05, 'Coal'],
 [-0.8923811497455455, 601160.0700000001, 'NG'],
 [0.9992659465724706, 132529.37000000002, 'Petro']]

In [57]:
class TEST_clean_or_conv(unittest.TestCase):
    def test_clean_or_conv(self):
        prec = test.iloc[:,1]
        ts = test.iloc[:,2]
        tw = test.iloc[:,3]
        ws = test.iloc[:,4]
        vote = ease.rf(prec, ts, tw, ws)
        avg_capacity = ease.avg_capacity(vote)
        possible_type= ease.possible_type(avg_capacity)
        conv, clean = ease.clean_or_conv(possible_type_list)
 
        # test the type of output dataset
        self.assertIsInstance(conv,list)
        # test the length of clean list
        self.assertLesser(len(clean),4)
        # test the element type inside of clean list
        self.assertIsInstance(clean[1][2],str)
        # test if clean source exist in conventional list.
        self.assertEqual(clean[0][2],'Coal')
        
class TEST_avg_cost(unittest.TestCase):
    def test_avg_cost(self):
        prec = test.iloc[:,1]
        ts = test.iloc[:,2]
        tw = test.iloc[:,3]
        ws = test.iloc[:,4]
        vote = ease.rf(prec, ts, tw, ws)
        avg_capacity = ease.avg_capacity(vote)
        possible_type= ease.possible_type(avg_capacity)
        conv, clean = ease.clean_or_conv(possible_type_list)
        
if __name__ == '__main__':
    unittest.main()

E
ERROR: C:\Users\yongquan\AppData\Roaming\jupyter\runtime\kernel-bfb1ee37-fc3c-47f0-8529-556c7109766b (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute 'C:\Users\yongquan\AppData\Roaming\jupyter\runtime\kernel-bfb1ee37-fc3c-47f0-8529-556c7109766b'

----------------------------------------------------------------------
Ran 1 test in 0.004s

FAILED (errors=1)


SystemExit: True

ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 210 

In [None]:
vote