# Practicals for lecture 1.2

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vigji/python-cimec/blob/main/practicals/Practicals_1.2.ipynb)

#### 1.0.2 

In [2]:
import numpy as np
import requests
import json


def generate_RT_data(n_subjects=200, n_samples_per_subject=1000):
    """
    Simulate Reaction Time (RT) samples for a group of subjects.

    Each subject gets its own normal distribution: the mean is drawn uniformly
    from [0.090, 0.110] and the standard deviation uniformly from [0.10, 0.20].
    A constant shift of 0.500 is then added to every sample.
    The global NumPy random seed is reset to 0 on every call, so repeated calls
    with the same arguments return the same data.

    Parameters:
    n_subjects (int): Number of subjects (rows of the output)
    n_samples_per_subject (int): Number of RT samples per subject (columns of the output)

    Returns:
    np.ndarray: A 2D array of shape (n_subjects, n_samples_per_subject); each row holds the RT samples of one subject
    """
    np.random.seed(0)  # Fixed seed, for reproducibility
    shift = 0.500  # Constant offset added to every sample

    # Pre-allocate the output; every row is overwritten in the loop below
    rt_matrix = np.empty((n_subjects, n_samples_per_subject))

    # Iterating over a 2D array yields row views, so writing into them fills rt_matrix
    for subject_row in rt_matrix:
        # Per-subject distribution parameters (drawn in this order: mean, then std)
        subject_mean = np.random.uniform(0.090, 0.110)
        subject_std = np.random.uniform(0.10, 0.20)
        samples = np.random.normal(subject_mean, subject_std, n_samples_per_subject)
        subject_row[:] = samples + shift

    return rt_matrix


def download_meteo_data(start_date="2022-01-01", end_date="2022-12-31",
                        latitude="45.88204", longitude="11.03647",
                        data="temperature_2m"):
    """Download hourly historical meteo data from open-meteo.com.

    Parameters:
    start_date (str): First day of the requested period, formatted like the default value
    end_date (str): Last day of the requested period, formatted like the default value
    latitude (str): Latitude of the location
    longitude (str): Longitude of the location
    data (str): Name of the hourly variable to request; also the key read from the response

    Returns:
    tuple: (timestamps, values) as two np.ndarray built from the "time" and `data`
        entries of the "hourly" section of the response. If the response has no
        "hourly" section, the decoded response is printed and (None, None) is returned.
    """
    BASE_URL = "https://archive-api.open-meteo.com/v1/"
    query = f"archive?latitude={latitude}&longitude={longitude}&start_date={start_date}&end_date={end_date}&hourly={data}"

    # Without a timeout, requests waits indefinitely on an unresponsive server
    r = requests.get(BASE_URL + query, timeout=30)
    json_dict = json.loads(r.text)

    if "hourly" not in json_dict:
        # No data in the response (presumably an error description): show it to the user
        print(json_dict)
        return None, None

    # Return a real tuple (not a one-shot generator), consistent with the branch above
    hourly = json_dict["hourly"]
    return np.array(hourly["time"]), np.array(hourly[data])

In [6]:
# Find the index of the subject with the shortest trial reaction time of the whole dataset
# (not shortest average!)
# (Hint: you will need two operations...)
# rt_data has one row per subject and one column per trial.
rt_data = generate_RT_data(n_subjects=10)
rt_data.shape  # display (n_subjects, n_samples_per_subject)

(10, 1000)

In [7]:
# Use argmax to find the index of the warmest hour in the (non-reshaped) temperatures_array.
# Then, use that index on timestamps_array to read out the corresponding timestamp.
timestamps_array, temperatures_array = download_meteo_data()


In [9]:
# We can use the np.argsort() function to produce the indexes array required to
# order an array in ascending or descending values.

# Let's make a ranking of the 5 warmest hours of 2022! 
# Sort the (non-reshaped) temperature array using the indexes produced by np.argsort,
# so that the first elements are the highest temperatures.
# Then sort the timestamps array with the same indexes, and take the first 5.
#
# Double check you match the result that you have got in the exercises above!

In [None]:
# Build a boolean selector to filter all temperatures above 10 and below 25:



In [10]:
# (Bonus):
# Let's do the same, but only for the months between January and March, and only for hours between 08 and 18.

# To get the condition on month and hours, you will have to parse the timestamp string, or explore
# the datetime library for more elegant solutions!

# Display the timestamps to inspect their string format before parsing them.
timestamps_array

array(['2022-01-01T00:00', '2022-01-01T01:00', '2022-01-01T02:00', ...,
       '2022-12-31T21:00', '2022-12-31T22:00', '2022-12-31T23:00'],
      dtype='<U16')

## Introduction to `pandas`

#### 1.2.0 Numpy bool; Create and index dataframes

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
# Numpy bool operations
# Take the array of integer numbers below. Use array boolean operations to keep only the numbers that
# are greater than 5 AND less than 8, OR that are multiple of 7.
np.random.seed(42)
an_array = np.random.randint(0, 10, 100)

# Build each condition as its own boolean mask, then combine them
between_5_and_8 = np.logical_and(an_array > 5, an_array < 8)
multiple_of_7 = np.mod(an_array, 7) == 0
an_array[np.logical_or(between_5_and_8, multiple_of_7)]

In [None]:
string_data = ['cabbage', 'artichoke', 'banana', 'avocado', 'apple', 'orange']
int_data = [1, 2, 3, 4, 5, 6]
float_data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

# Put together the data above in a DataFrame. What happens if you don't specify the index?

# Each dictionary key becomes a column name; no index is passed, so pandas uses the default one
columns = {
    "string_data": string_data,
    "int_data": int_data,
    "float_data": float_data,
}
df = pd.DataFrame(columns)

In [None]:
# Select the column of the dataframe containing the strings using the name of the column
# (square brackets with a single column name select that column)

df["string_data"]

In [None]:
# Select the first two rows of the dataframe.
# .loc slices by index label and includes the end label, so 0:1 covers rows 0 and 1.
df.loc[0:1, :]

In [None]:
# Select the rows of the dataframe where the float data is greater than 0.2,
# and show the string data of those rows

float_above_threshold = df["float_data"].gt(0.2)
df.loc[float_above_threshold, "string_data"]

In [None]:
# Select the rows of the dataframe so that the float data is greater than 0.2 and the int data is less than 5
# (one boolean mask per condition, combined element-wise with &)
float_above_threshold = df["float_data"].gt(0.2)
int_below_threshold = df["int_data"].lt(5)
df.loc[float_above_threshold & int_below_threshold]

In [None]:
# use the .loc property to select the value of the float data in the row with index 3
# (.loc takes the row label first, then the column name)
df.loc[3, "float_data"]

In [None]:
# Add a new column to the dataframe containing the following data entries:
new_data = [500, 300, 200, 400, 600, 500]

# Assigning to a column name that does not exist yet adds the column to df in place
df["new_col"] = new_data
df  # display the updated dataframe


#### 1.2.1 Methods of `pandas` dataframes

In [None]:
# Here we create a fake dataframe containing the results of a psychological test with 30 subjects.
# Subjects can be left-handed or right-handed.
# The test has 2 measures (reaction time - RT, and accuracy)
def create_data_df():
    """Build the fake test-results dataframe (one row per subject).

    The global NumPy random seed is reset to 42 on every call, so the same
    dataframe is returned each time.

    Returns:
    pd.DataFrame: 30 rows with columns 'subject', 'handedness', 'RT' and 'accuracy'
    """
    np.random.seed(42)
    n_subjects = 30

    # Hidden per-subject factor shared by both measures
    ability = np.random.uniform(0, 1, n_subjects)

    # The random draws below are kept in this order so the generated values stay the same
    subject_labels = [f"subject_{idx}" for idx in range(n_subjects)]
    handedness = np.random.choice(['left', 'right'], n_subjects)
    reaction_times = ability * 100 + np.random.uniform(0, 50, n_subjects)
    accuracy = ability + np.random.normal(0.8, 0.1, n_subjects)

    return pd.DataFrame(dict(subject=subject_labels,
                             handedness=handedness,
                             RT=reaction_times,
                             accuracy=accuracy))


df = create_data_df()

In [None]:
# sort the dataframe by RT:
# (the sorted result is only displayed here; it is not assigned back to df)
df.sort_values(by="RT")

In [None]:
# Compute the mean and standard deviation of the RT and accuracy across the dataset:
# (.agg with a list of function names returns one row per statistic, one column per measure)
df[["RT", "accuracy"]].agg(["mean", "std"])

In [None]:
# Use indexing to select the RT of the left-handed subjects, and compute its 90% percentile:
is_left_handed = df["handedness"].eq("left")
lefthand_rt = df.loc[is_left_handed, "RT"]
lefthand_rt_90perc = lefthand_rt.quantile(0.9)

In [None]:
# Now use the percentile to select the accuracy for left-handed subjects with RT above the 90% percentile:
# First keep only the left-handed subjects, then filter them by RT
lefthanded_df = df.loc[df["handedness"].eq("left")]
rt_above_90perc = lefthanded_df["RT"].gt(lefthand_rt_90perc)
lefthanded_df.loc[rt_above_90perc, "accuracy"]

In [None]:
# Create a scatter plot of RT vs accuracy for the right-handed subjects:
# (RT on the x axis, accuracy on the y axis)
righthanded_df = df.loc[df["handedness"].eq("right")]
righthanded_df.plot.scatter(x="RT", y="accuracy")

In [None]:
# Take the meteo dataset using the function below
def get_meteo_dataset():
    """Download the meteo dataset from the open-meteo API as a dataframe.

    pandas reads the csv directly from the URL, so no separate download step is needed.
    After loading, the column names are shortened, the time column is parsed, and
    hour / day-of-year / weekday columns are derived from it. Finally some of the
    first 1000 rows are blanked out (set to NaN) to simulate missing data; the
    choice of rows is reproducible because the NumPy seed is fixed to 42.

    Returns:
    pd.DataFrame: the meteo data with the extra time-derived columns and the artificial gaps
    """
    np.random.seed(42)
    URL = "https://api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&hourly=temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,winddirection_10m&start_date=2023-02-01&end_date=2023-05-28&format=csv"

    # The first 3 rows of the csv are a header and are skipped
    meteo = pd.read_csv(URL, skiprows=3)

    # Keep only the first word of each column name
    meteo = meteo.rename(columns=lambda name: name.split(" ")[0])

    # Parse the timestamps once and derive the calendar columns from them
    timestamps = pd.to_datetime(meteo["time"])
    meteo["time"] = timestamps
    meteo["hour"] = timestamps.dt.hour
    meteo["dayofyear"] = timestamps.dt.dayofyear
    meteo["weekdays"] = timestamps.dt.day_name()

    # Artificially corrupt some rows to make the data more interesting:
    # 100 draws (with replacement) among the first 1000 row labels
    corrupted_rows = np.random.choice(meteo.index[:1000], 100)
    meteo.loc[corrupted_rows, :] = np.nan
    return meteo


meteo_df = get_meteo_dataset()

In [None]:
# Display the dataframe to check the columns and the rows that were set to NaN
meteo_df

In [None]:
# Plot the temperature and relative humidity for the first 1000 time points.
# Find points where there's missing data (interrupted line).
# Column names are the `hourly=` variables requested in the URL, with the unit suffix stripped.
# Note: .loc slices by index label and includes the end label.
# One panel per variable (subplots=True), since the two variables are on different scales.
meteo_df.loc[:1000, ["temperature_2m", "relativehumidity_2m"]].plot(subplots=True)

In [None]:
# Create a new interpolated temperature column by interpolating the temperature column of the dataframe
# (linear interpolation, the default method, fills the missing values):
raw_temperature = meteo_df["temperature_2m"]
meteo_df["temperature_2m_interp"] = raw_temperature.interpolate(method="linear")

In [None]:
# Make a new plot with the interpolated temperature and the non-interpolated temperature
# (plot the interpolated first!)
# Take the slice once, then draw both columns from it
first_points = meteo_df.loc[:1000]
first_points["temperature_2m_interp"].plot()
first_points["temperature_2m"].plot()