## Trevor Arellanes 
## Data Analytics Fundamentals
## Module 7 - Task 3

### Importing Libraries

In [32]:
import os

import pandas as pd
import scipy
%matplotlib 

Using matplotlib backend: TkAgg


### Linear Relationships

In [33]:
def c(f):
    """Convert a temperature ``f`` in degrees Fahrenheit to degrees Celsius."""
    return 5 / 9 * (f - 32)


# (Fahrenheit, Celsius) pairs for 0 through 100 degrees F in steps of 10.
temps = [(f, c(f)) for f in range(0, 101, 10)]

In [34]:
# Two-column table built from the (Fahrenheit, Celsius) pairs.
temps_df = pd.DataFrame(temps, columns=['Fahrenheit', 'Celsius'])

# Line plot with point markers; the y-axis label is set explicitly afterwards.
axes = temps_df.plot.line(x='Fahrenheit', y='Celsius', style='.-')
y_label = axes.set_ylabel('Celsius')

In [35]:
# Location of the CSV of NYC January average-high temperatures (per its file name).
# The original absolute path stays as the default so existing runs are unchanged;
# set the NYC_TEMPS_CSV environment variable to load the file from another location.
NYC_CSV_PATH = os.environ.get(
    'NYC_TEMPS_CSV',
    r'C:\Users\tarellanes\Desktop\Fundamentals of Data Analytics\IntroToPython-master\examples\ch10\ave_hi_nyc_jan_1895-2018.csv',
)

nyc = pd.read_csv(NYC_CSV_PATH)

### First and last 5 entries of the DataFrame

In [36]:
# Preview the first five rows to confirm the CSV loaded with the expected columns.
nyc.head()

Unnamed: 0,Date,Value,Anomaly
0,189501,34.2,-3.2
1,189601,34.7,-2.7
2,189701,35.5,-1.9
3,189801,39.6,2.2
4,189901,36.4,-1.0


In [37]:
# Preview the last five rows to see where the series ends.
nyc.tail()

Unnamed: 0,Date,Value,Anomaly
119,201401,35.5,-1.9
120,201501,36.1,-1.3
121,201601,40.8,3.4
122,201701,42.8,5.4
123,201801,38.7,1.3


### Cleaning the data - adjusting column titles

In [38]:
# Rename only the column that needs it, by name. Assigning a full positional list
# to `nyc.columns` would silently mislabel the data if the CSV's column order or
# count ever changed. The rename is a no-op when the cell is re-run.
nyc = nyc.rename(columns={'Value': 'Temperature'})

# Fail loudly here, rather than in a later cell, if the schema is not what the
# rest of the notebook relies on.
assert list(nyc.columns) == ['Date', 'Temperature', 'Anomaly'], list(nyc.columns)

nyc.head(3)

Unnamed: 0,Date,Temperature,Anomaly
0,189501,34.2,-3.2
1,189601,34.7,-2.7
2,189701,35.5,-1.9


### Looking at the data type and using floor division to truncate the last 2 digits

In [39]:
# Check the Date column's dtype before doing integer arithmetic on it.
nyc.Date.dtype

dtype('int64')

In [40]:
# Dates arrive as YYYYMM integers (e.g. 189501); keep only the year.
# Guarded so the cell is idempotent: values already reduced to a four-digit year
# (below 10000) are left untouched, so re-running the cell cannot divide by 100
# a second time and corrupt the years.
nyc.Date = nyc.Date.where(nyc.Date < 10000, nyc.Date.floordiv(100))

nyc.head(3)

Unnamed: 0,Date,Temperature,Anomaly
0,1895,34.2,-3.2
1,1896,34.7,-2.7
2,1897,35.5,-1.9


### Displaying results to 2 decimal places and viewing basic descriptive statistics for the data set

In [41]:
# Show floating-point values with two digits after the decimal point.
pd.options.display.precision = 2

# Count, mean, standard deviation, quartiles and extremes of the temperatures.
nyc['Temperature'].describe()

count    124.00
mean      37.60
std        4.54
min       26.10
25%       34.58
50%       37.60
75%       40.60
max       47.60
Name: Temperature, dtype: float64

### Calculating the slope and intercept, and predicting the 2026 temperature from the results

In [42]:
from scipy import stats

# Least-squares fit of temperature (y) against year (x) using SciPy.
linear_regression = stats.linregress(nyc['Date'], nyc['Temperature'])

linear_regression.slope  # slope of the fitted line

0.014771361132966163

In [43]:
linear_regression.intercept # y-intercept of the line fitted by linregress

8.694993233674289

In [44]:
# Evaluate the fitted line y = slope * x + intercept at x = 2026.
linear_regression.slope * 2026 + linear_regression.intercept

38.62177088906374

### Using seaborn to plot the data with a regression line

In [45]:
import seaborn as sns

sns.set_style('whitegrid')

# Scatter of the yearly temperatures with seaborn's regression line overlaid.
axes = sns.regplot(data=nyc, x='Date', y='Temperature')

# Fix the y-axis range to 10-70.
axes.set_ylim(10, 70)

(10.0, 70.0)

### 15.4 - Using scikit-learn to analyze regression for temps

### Splitting the Data for Training and Testing

In [46]:
from sklearn.model_selection import train_test_split

# Features must be 2-D (rows x 1 column) for scikit-learn, hence the reshape;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    nyc['Date'].to_numpy().reshape(-1, 1),
    nyc['Temperature'].to_numpy(),
    random_state=11,
)

In [47]:
# (rows, features) of the training partition.
X_train.shape

(93, 1)

In [48]:
# (rows, features) of the held-out test partition.
X_test.shape

(31, 1)

### Training the Model

In [49]:
from sklearn.linear_model import LinearRegression

In [64]:
# NOTE(review): this rebinds `linear_regression`, which earlier held the SciPy
# linregress result. The earlier cells that read `.slope` / `.intercept` will
# raise AttributeError on this estimator if they are re-run after this cell.
linear_regression = LinearRegression()

# Fit on the training partition only; the test rows stay unseen for evaluation.
linear_regression.fit(X=X_train, y=y_train)

In [51]:
# Fitted slope; a one-element array because the model has a single feature.
linear_regression.coef_

array([0.01939167])

In [52]:
# Fitted y-intercept of the regression line.
linear_regression.intercept_

-0.30779820252656975

### Testing the Model

In [63]:
expected = y_test
predicted = linear_regression.predict(X_test)

# Spot-check every fifth test sample: model output next to the observed value.
for guess, actual in zip(predicted[::5], expected[::5]):
    print(f'predicted: {guess:.2f}, expected: {actual:.2f}')

predicted: 37.86, expected: 31.70
predicted: 38.69, expected: 34.80
predicted: 37.00, expected: 39.40
predicted: 37.25, expected: 45.70
predicted: 38.05, expected: 32.30
predicted: 37.64, expected: 33.80
predicted: 36.94, expected: 39.70


In [62]:
def predict(x):
    """Evaluate the fitted regression line at year(s) ``x``.

    ``x`` may be a scalar or a NumPy array. The result is a NumPy array because
    ``linear_regression.coef_`` is an array. ``linear_regression`` is looked up
    when the function is called, so the fitted scikit-learn estimator must
    already exist at that point.
    """
    return linear_regression.coef_ * x + linear_regression.intercept_


predict(2026)

array([38.97973189])

In [55]:
# Extrapolate backwards to 1890, before the first year in the data (1895).
predict(1890)

array([36.34246432])

### Visualising the data set with regression line

In [56]:
# Scatter of temperature by year, with each point colored by its temperature.
axes = sns.scatterplot(
    x='Date',
    y='Temperature',
    hue='Temperature',
    palette='winter',
    legend=False,
    data=nyc,
)

# Fix the y-axis range to 10-70.
axes.set_ylim(10, 70)

(10.0, 70.0)

In [60]:
import numpy as np

# First and last year in the data; two points are enough to draw a straight line.
x = np.array([nyc.Date.min(), nyc.Date.max()])

In [58]:
# Evaluate the fitted line at the two endpoint years held in x.
y = predict(x)

### Plotting line for regression analysis

In [61]:
import matplotlib.pyplot as plt

# Draw the regression line between the two endpoints. plt.plot draws on the
# current Axes -- presumably the scatter plot created earlier; confirm when re-running.
line = plt.plot(x,y)

### Differences between time series prediction using linregress and scikit-learn
- Using the `linregress` function from SciPy is a simpler way to fit and predict time series data
- It works best for simple (single-variable) linear regression
- scikit-learn offers more flexibility and options when analyzing data
- scikit-learn offers tools for not only regression, but also classification, clustering, and other tasks
- It has more features for complex data and data problems