In [None]:
%matplotlib inline
##Importing the necessary packages for the modeling and analysis
import xlrd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

## Parses one survey workbook by hand (no dataframe package) and reports how
# common smoking and vaping are among the rows whose grade code is in range.
def _cellAsInt(value):
    """Return ``int(value)``, or None when the cell holds no usable number.

    Blank cells (''), the '*' placeholder, and anything ``int()`` rejects are
    all reported as None so callers can treat them uniformly as missing.
    """
    if value == '' or value == '*':
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Unexpected text, NaN or infinity: treat as a missing record rather
        # than aborting the scan of the whole workbook.
        return None


def getPercentages(filename, gradeIndex, smokeIndex, vapeIndex):
    """Return the proportions of smokers and vapers recorded in a workbook.

    The first sheet of ``filename`` is scanned row by row, skipping row 0
    (treated as the header). A row is considered only when its grade cell
    converts to an integer code from 4 to 7 inclusive; within such a row, a
    smoking or vaping cell equal to 1 counts as one user. Cells that are
    empty, contain '*', or cannot be converted with ``int()`` are treated as
    missing and never counted.

    Parameters:
        filename: path of the workbook, passed to ``xlrd.open_workbook``.
        gradeIndex: zero-based column index of the grade code in each row.
        smokeIndex: zero-based column index of the smoking answer.
        vapeIndex: zero-based column index of the vaping answer.

    Returns:
        A ``(smokeProportion, vapeProportion)`` tuple of floats.

    Raises:
        ValueError: if the first sheet contains no rows at all.

    NOTE(review): xlrd 2.0 and later only read ``.xls`` files; opening the
    ``.xlsx`` workbooks this file passes in presumably needs xlrd < 2.0 --
    confirm against the installed version.
    """
    wb = xlrd.open_workbook(filename)
    sheet = wb.sheet_by_index(0)
    if sheet.nrows == 0:
        raise ValueError("No rows found in the first sheet of " + str(filename))

    # Instantiate the counters for smokers and vapers.
    smokeCount = 0
    vapeCount = 0

    for row in range(1, sheet.nrows):
        # Fetch the row once; row_values() builds a fresh list on every call.
        values = sheet.row_values(row)

        grade = _cellAsInt(values[gradeIndex])
        if grade is None or not 4 <= grade <= 7:
            continue

        # Counting the number of instances of smoking
        if _cellAsInt(values[smokeIndex]) == 1:
            smokeCount += 1
        # Counting the number of instances of vaping
        if _cellAsInt(values[vapeIndex]) == 1:
            vapeCount += 1

    # NOTE(review): the denominator is every row in the sheet, including the
    # header row and rows rejected by the grade filter. Kept as-is so existing
    # results do not shift; confirm whether the count of accepted rows was
    # intended instead.
    return float(smokeCount) / sheet.nrows, float(vapeCount) / sheet.nrows
## Survey years under study, plus the two lists that collect one smoking
# proportion and one vaping proportion per year (in the same order as `years`).
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017]
smokeArr = []
vapeArr = []

## Each year's workbook keeps the grade, smoking and vaping answers in
# different columns, so every year carries its own set of column indices.
# Entries are: (label to print, workbook path, grade column, smoking column,
# vaping column).
surveyLayouts = [
    ("2011", "./2011.xlsx", 9, 17, 95),
    ("2012", "./2012.xlsx", 9, 17, 103),
    ("2013", "./2013.xlsx", 9, 23, 110),
    ("2014", "./2014.xlsx", 9, 21, 45),
    ("2015", "./2015.xlsx", 11, 22, 44),
    ("2016", "./2016.xlsx", 12, 24, 43),
    ("2017", "./2017.xlsx", 13, 25, 46),
]

## Process the workbooks in chronological order, announcing each year before
# it is parsed and recording its two proportions.
for yearLabel, workbookPath, gradeColumn, smokeColumn, vapeColumn in surveyLayouts:
    print(yearLabel)
    smoke, vape = getPercentages(workbookPath, gradeColumn, smokeColumn, vapeColumn)
    smokeArr.append(smoke)
    vapeArr.append(vape)


In [None]:
## Fits a straight line to each historical series with scipy's linregress,
# plots the observed points together with the fitted line drawn across
# `rangeYears`, prints the line's equation and R value, and evaluates the line
# at every entry of `futureYears` to build the forecast lists.
rangeYears = [2011, 2029]

## Years at which each fitted line is evaluated for extrapolation.
futureYears = list(range(2017, 2030))

## One entry per series: (observed values, marker style, marker label,
# line style, line label). Smoking is handled first, then vaping.
seriesSpecs = [
    (smokeArr, 'ro', 'Historical Data for Smoking',
     'r', 'Projections for Smoking'),
    (vapeArr, 'bo', 'Historical Data for Vaping',
     'b', 'Projections for Vaping'),
]

forecasts = []
for observed, pointStyle, pointLabel, lineStyle, lineLabel in seriesSpecs:
    slope, intercept, r_value, p_value, std_err = stats.linregress(years, observed)
    # Fitted values at the historical years and at the two plot end points.
    line = float(slope) * np.asarray(years) + intercept
    line2 = float(slope) * np.asarray(rangeYears) + intercept
    plt.plot(years, observed, pointStyle, label=pointLabel)
    plt.plot(rangeYears, line2, lineStyle, label=lineLabel)
    print("y=" + str(slope) + "x+" + str(intercept))
    print("R=" + str(r_value))
    # Extrapolate this series' line over the future years.
    forecasts.append([slope * year + intercept for year in futureYears])

## After the loop the regression names above hold the vaping fit; the two
# forecasts are unpacked in the order the series were processed.
futureSmokeArr, futureVapeArr = forecasts

## Labels the axes to reflect the units and the data being visualized.
plt.xlabel('Year 2011-2029, $t$ (years)')
plt.ylabel('Proportion of Students Who Use Product')
plt.legend()
plt.xlim(rangeYears[0], rangeYears[-1])
plt.ylim(-0.1, 0.6)
plt.show()

# The straight-line smoking forecast is printed for reference and then
# replaced by the hard-coded list below; nothing in this cell computes these
# values. The first eight entries fall by a constant step (about 0.0136 per
# year), while the last five flatten out instead of continuing downward --
# presumably hand-adjusted so the projection decays toward zero rather than
# going negative. Treat the result as a rough projection only.
print(futureSmokeArr)
futureSmokeArr = [
    0.13007163609597683, 0.11643189698088108,
    0.10279215786578533, 0.08915241875068958,
    0.07551267963559027, 0.061872940520494524,
    0.04823320140539877, 0.03459346229030302,
    0.024953723175207273, 0.016139840601115225,
    0.011325755054987781, 0.00926549417008353,
    0.008860523328517928,
]

# Series to draw, in legend order: (x values, y values, style, legend label).
# The two single-point series are the 2018 figures used to sanity-check the
# projections; their source is not shown in this file.
plotSeries = [
    (futureYears, futureSmokeArr, '-r', 'Adjusted Projections for Smoking'),
    (futureYears, futureVapeArr, '-b', 'Projections for Vaping'),
    ([2018], [0.093], 'yo', 'Smoking 2018 result'),
    ([2018], [0.208], 'go', 'Vaping 2018 result'),
]
for xValues, yValues, lineStyle, legendLabel in plotSeries:
    plt.plot(xValues, yValues, lineStyle, label=legendLabel)

plt.xlabel('Year 2018-2029, $t$ (years)')
plt.ylabel('Predicted Proportion of Students Who Use Product')
plt.xlim(2017, 2029)
plt.ylim(0, 0.6)
plt.legend()
plt.show()