<a href="https://colab.research.google.com/github/vuhpham94/nflx-data-projects-g6/blob/vp-dev/project2/notebooks/project2_g6_elnino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Optional Google Drive mount (only meaningful on Colab). Both lines are
# commented out, so running this cell does nothing.
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import os
# Spark release to install. The tarball is downloaded below from
# https://archive.apache.org/dist/spark/, so this value must match a release
# directory name there.
# For example:
# spark_version = 'spark-3.0.1'
spark_version = 'spark-3.0.1'
# Exported so the `!` shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java 11, download and unpack the Spark distribution into the current
# directory, and install findspark.
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Point Spark at the JDK installed above and at the unpacked distribution.
# NOTE(review): the /content prefix assumes the Colab working directory --
# confirm before running this notebook anywhere else.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Make the unpacked Spark installation importable as `pyspark`.
# No SparkSession is started here; the session is created in a later cell.
# The import sits here rather than at the top because findspark is only
# pip-installed a few lines above.
import findspark
findspark.init()

In [None]:
 #import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# we are going to use this to time our queries.
import time

# Build the session used by the Spark cells below; if a session already
# exists (for example when this cell is re-run) it is reused.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [None]:
# Download the El Nino CSV from the project's GitHub repository and read it
# into a Spark DataFrame. inferSchema is not requested, so Spark's default
# applies and every column is loaded as a string.
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/vuhpham94/nflx-data-projects-g6/main/project2/resources/dataset/elnino.csv"
spark.sparkContext.addFile(url)
df = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .csv(SparkFiles.get("elnino.csv"))
)

In [None]:
# Print Spark's per-column summary statistics for the raw data.
df.summary().show()

In [None]:
# Show the column names and the types Spark assigned on load.
df.printSchema()

In [None]:
# Collect the Spark DataFrame into a local pandas DataFrame; all cleaning and
# modelling below is done in pandas / scikit-learn.
pandas_df = df.toPandas()

In [None]:
# Column dtypes and non-null counts before any cleaning.
pandas_df.info()

In [None]:
# Give the columns readable names (assigned in place on pandas_df).
pandas_df.columns = [
    "Observation", "Year", "Month", "Day", "Date",
    "Latitude", "Longitude",
    "Zonal Winds", "Meridional Winds", "Humidity",
    "Air Temp", "Sea Surface Temp",
]

# Cells holding '.' (presumably the dataset's missing-value marker) become the
# string 'NaN', which the float casts below parse as a missing value.
# 'Date' is dropped before casting (presumably redundant with Year/Month/Day).
clean_df = (
    pandas_df
    .replace('.', 'NaN')
    .drop(columns=['Date'])
    .astype({
        **dict.fromkeys(['Observation', 'Year', 'Month', 'Day'], 'int'),
        **dict.fromkeys(
            ['Latitude', 'Longitude', 'Zonal Winds', 'Meridional Winds',
             'Humidity', 'Air Temp', 'Sea Surface Temp'],
            'float',
        ),
    })
)

In [None]:
# Dtypes and non-null counts after the casts; the non-null counts show how many
# values are missing in each column.
clean_df.info()

In [None]:
# Dependencies
from sklearn import datasets
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


## Linear Regression after dropping rows with missing values

In [None]:
# Keep only fully observed rows (a NaN in any column removes the row),
# then report what is left.
dropna_df = clean_df.dropna(axis=0, how='any')
dropna_df.info()

In [None]:
# Restrict the complete rows to position plus the five measurement columns;
# the record number and the calendar columns are left out.
select_dropna_df = dropna_df.loc[:, [
    'Latitude', 'Longitude',
    'Zonal Winds', 'Meridional Winds', 'Humidity',
    'Air Temp', 'Sea Surface Temp',
]]

In [None]:
# Dropped-NaN experiment, target 1: predict Sea Surface Temp.
# 'Air Temp' is left out of the predictors together with the target itself.
X = select_dropna_df.drop(columns=['Air Temp','Sea Surface Temp'])
y = select_dropna_df['Sea Surface Temp']

# Fit and score in this cell. The next cell reassigns X and y for the Humidity
# target, so on a top-to-bottom run (Restart & Run All) this is the only point
# at which the Sea Surface Temp model can be trained and evaluated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

# Displayed as the cell output: R^2 on the held-out 20% of rows.
model.score(X_test, y_test)

In [None]:
# Dropped-NaN experiment, target 2: predict Humidity from every other selected
# column. These assignments replace the X and y defined in the previous cell.
y = select_dropna_df.loc[:, 'Humidity']
X = select_dropna_df.drop('Humidity', axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Fit a linear regression on the training portion (fit() returns the model).
model = LinearRegression().fit(X_train, y_train)

print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

# Predictions for the held-out rows (stored in y_pred; not inspected here).
y_pred = model.predict(X_test)

# Displayed as the cell output: R^2 on the held-out rows.
model.score(X_test, y_test)

## Linear Regression with missing values filled by model predictions

In [None]:
# Independent copy, so the imputation below does not modify clean_df.
clean1_df = clean_df.copy(deep=True)

# Switch read by the imputation loop below:
#   True  -> train each imputation model only on rows whose predictors are complete
#   False -> fill gaps in the predictors with column means and train on all rows
drop_train = False

In [None]:
# Impute each measurement column with a linear model fitted on the rows where
# that column was actually observed.

# Columns that may contain NaN and are imputed one at a time.
fill_cols = ['Zonal Winds', 'Meridional Winds', 'Humidity', 'Air Temp', 'Sea Surface Temp']

# Column means, used only as a stand-in for gaps in the *predictor* columns:
# LinearRegression accepts no NaN in its inputs.
fill_values = {name: clean1_df[name].mean() for name in fill_cols}

# Predictor pool: every column except 'Observation'. It is taken once, before
# the loop, so each model sees the originally observed values and not values
# imputed for another column in an earlier iteration.
# cols_df = clean1_df[['Latitude', 'Longitude', 'Zonal Winds', 'Meridional Winds', 'Humidity', 'Air Temp', 'Sea Surface Temp']]
cols_df = clean1_df.drop(columns=['Observation'])

for col in fill_cols:
    X1 = cols_df.drop(columns=[col])
    y1 = cols_df[col]
    missing = y1.isnull()

    # Nothing to impute for this column. Without this guard model.predict()
    # would receive zero rows and raise ValueError -- which is what happens
    # when this cell is run a second time, because the first run has already
    # filled every gap in clean1_df.
    if not missing.any():
        continue

    # Rows where the target was observed train the model ...
    X1_train = X1.loc[~missing]
    y1_train = y1.loc[~missing]
    # ... and rows where it is missing are the ones to predict. Their own
    # predictor gaps are mean-filled so the model can be evaluated on them.
    X1_test = X1.loc[missing].fillna(value=fill_values)

    if drop_train:
        # Train only on rows whose predictors are complete.
        X1_train_tmp = X1_train.dropna()
        y1_train_tmp = y1_train.loc[X1_train_tmp.index]
    else:
        # Train on every observed row, mean-filling predictor gaps.
        X1_train_tmp = X1_train.fillna(value=fill_values)
        y1_train_tmp = y1_train

    model = LinearRegression()
    model.fit(X1_train_tmp, y1_train_tmp)
    y_pred = model.predict(X1_test)

    # Write the predictions into the gaps; observed values are left untouched.
    clean1_df.loc[missing, col] = y_pred

In [None]:
# Non-null counts after imputation, to check whether any gaps remain.
clean1_df.info()

In [None]:
# Imputed-data experiment, target 1: predict Sea Surface Temp.
# NOTE(review): 'Observation' looks like a record number rather than a
# measurement, so it is excluded from the predictors here, as the imputation
# loop above already does. 'Air Temp' is excluded together with the target.
X = clean1_df.drop(columns=['Observation', 'Air Temp', 'Sea Surface Temp'])
y = clean1_df['Sea Surface Temp']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit a linear regression on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

# Predictions for the held-out rows (stored in y_pred; not inspected here).
y_pred = model.predict(X_test)

# Displayed as the cell output: R^2 on the held-out rows.
model.score(X_test, y_test)

In [None]:
# Imputed-data experiment, target 2: predict Humidity.
# NOTE(review): 'Observation' looks like a record number rather than a
# measurement, so it is excluded from the predictors here, as the imputation
# loop above already does.
X = clean1_df.drop(columns=['Observation', 'Humidity'])
y = clean1_df['Humidity']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit a linear regression on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

# Predictions for the held-out rows (stored in y_pred; not inspected here).
y_pred = model.predict(X_test)

# Displayed as the cell output: R^2 on the held-out rows.
model.score(X_test, y_test)