<a href="https://colab.research.google.com/github/yajuna/tmath495Sp24/blob/master/CleanData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pull data sets from GitHub and combine

By Austin B

In [1]:
import pandas as pd
!pip install --user openpyxl

# Reorder tree data by temp with column name as new feature
url1 = "https://raw.githubusercontent.com/yajuna/linearRegression/master/Tree_Temp_Values_AUG21_to_AUG28_2022.xlsx"
tree = pd.read_excel(url1)
tree = tree.melt(id_vars=['Date Time'], var_name='Temperature Source', value_name='Temperature')

# Format the date time column and set as index
tree['Date Time'] = pd.to_datetime(tree['Date Time'], format='%m/%d/%Y %H:%M')
tree.set_index('Date Time', inplace=True)
print(tree)

# Set the date time as a pd.datetime column and to the index
url2 = "https://raw.githubusercontent.com/yajuna/linearRegression/master/Weather_Station_AUG21_to_AUG28_2022.xlsx"
weather = pd.read_excel(url2)

# Set the date_time as a pandas datetime and the index
weather['datetime'] = pd.to_datetime(weather['datetime'])
weather.set_index('datetime', inplace=True)

# Reindex weather to match the tree data frame
weather = weather.reindex(tree.index, method='nearest')

# Combine the data frames
combined = pd.concat([tree, weather], axis=1)
print(combined)

[31mERROR: Can not perform a '--user' install. User site-packages are not visible in this virtualenv.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
                    Temperature Source  Temperature
Date Time                                          
2022-08-21 00:01:51          S4.5cm@1m        25.18
2022-08-21 00:05:07          S4.5cm@1m        25.18
2022-08-21 00:08:23          S4.5cm@1m        25.12
2022-08-21 00:11:38          S4.5cm@1m        25.18
2022-08-21 00:14:54          S4.5cm@1m        25.12
...                                ...          ...
2022-08-28 23:45:31    W_Ext_Temp@3.5m        25.12
2022-08-28 23:48:46    W_Ext_Temp@3.5m        25.12
2022-08-28 23:52:02    W_Ext_Temp@3.5m        25.06
2022-08-28 23:55:17    W_Ext_Temp@3.5m        25.00
2022-

## Clean the data and output to CSV

In [3]:
# Rebuild the combined frame from `tree` and `weather` (defined by the loading
# cell) so this cell can be re-run on its own without accumulating changes.
combined = pd.concat([tree, weather], axis=1)

# Degrees assigned to the side of the tree a sensor sits on, keyed by the
# first letter of its label.
directionDict = {'S': 0, 'N': 180, 'E': 270, 'W': 90}

# Sensor labels containing this tag carry no depth value; they get depth 0.
EXTERIOR_TAG = '_Ext_Temp'


def _strip_unit(text):
    """Return `text` without a trailing 'cm' or 'm' unit suffix."""
    for unit in ('cm', 'm'):
        if text.endswith(unit):
            return text[:-len(unit)]
    return text


def _parse_source(label):
    """Split a sensor label into ``(direction, depth, height)``.

    Labels look like 'S4.5cm@1m' (side, depth, height) or 'W_Ext_Temp@3.5m'
    (side, exterior tag, height).  Both '@' and ',' are accepted as the
    separator between the sensor part and the height part.

    Only trailing unit suffixes are removed, so names such as 'Temp' are left
    intact instead of losing every 'm'.

    Returns:
        The direction in degrees (looked up in `directionDict`), the depth as
        a float (0.0 for labels carrying `EXTERIOR_TAG`) and the height as a
        float.

    Raises:
        ValueError: if the label does not start with a known side letter.
    """
    parts = label.replace('@', ' ').replace(',', ' ').split(' ')
    probe, height_text = parts[0], parts[1]

    side = probe[0]
    if side not in directionDict:
        raise ValueError(f"Unknown side {side!r} in sensor label {label!r}")

    depth_text = probe[1:]
    if depth_text.endswith(EXTERIOR_TAG):
        depth = 0.0
    else:
        depth = float(_strip_unit(depth_text))

    return directionDict[side], depth, float(_strip_unit(height_text))


# Parse every label once, then fan the three components out into columns.
# Plain lists are assigned positionally, which sidesteps index alignment on
# the repeated timestamps.
parsed = [_parse_source(label) for label in combined['Temperature Source']]
combined['direction'] = [direction for direction, _, _ in parsed]
combined['depth'] = [depth for _, depth, _ in parsed]
combined['height'] = [height for _, _, height in parsed]

# Reorder the columns and drop redundant columns: temperature first, then the
# three derived spatial columns (currently last), then the first six weather
# columns.  Selection is positional, so it depends on the weather sheet's
# column order.
combined = combined.drop('Temperature Source', axis=1)
cols = combined.columns.tolist()
cols = [cols[0]] + cols[-3:] + cols[1:7]
combined = combined[cols]

# Rename the index and columns.
combined.index.names = ['date_time']
combined.columns = ['temperature', 'direction', 'depth', 'height', 'wind_speed',
                    'wind_direction', 'air_humidity', 'air_temperature',
                    'air_pressure', 'solar_DNI']

# Create a space column holding the three spatial dimensions as one list.
combined['space'] = combined[['direction', 'depth', 'height']].apply(list, axis=1)


print(combined)

# Save the cleaned data
combined.to_csv('./data.csv')

# Read the cleaned data
# with open('./data.csv') as f:
#     print(f.read())

                     temperature  direction  depth  height  wind_speed  \
date_time                                                                
2022-08-21 00:01:51        25.18          0    4.5     1.0      2.1758   
2022-08-21 00:05:07        25.18          0    4.5     1.0      2.0381   
2022-08-21 00:08:23        25.12          0    4.5     1.0      2.0381   
2022-08-21 00:11:38        25.18          0    4.5     1.0      2.0381   
2022-08-21 00:14:54        25.12          0    4.5     1.0      2.0381   
...                          ...        ...    ...     ...         ...   
2022-08-28 23:45:31        25.12         90    0.0     3.5      1.3129   
2022-08-28 23:48:46        25.12         90    0.0     3.5      1.3129   
2022-08-28 23:52:02        25.06         90    0.0     3.5      1.3129   
2022-08-28 23:55:17        25.00         90    0.0     3.5      1.3129   
2022-08-28 23:58:32        24.56         90    0.0     3.5      1.3129   

                     wind_direction  