# Practicals for lecture 1.2

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vigji/python-cimec-2024/blob/main/practicals/Practicals_1.2.ipynb)

#### 1.2.0 

In [2]:
import numpy as np
import requests
import json


def generate_RT_data(n_subjects=200, n_samples_per_subject=1000, seed=0):
    """
    Generates Reaction Time data for a given number of subjects, each with their own distribution parameters.

    For every subject a mean is drawn uniformly from [0.090, 0.110) and a
    standard deviation uniformly from [0.10, 0.20); the subject's samples are
    then drawn from the corresponding normal distribution and a constant
    shift of 0.500 is added to each of them.

    Parameters:
    n_subjects (int): Number of subjects
    n_samples_per_subject (int): Number of samples (RT times) per subject
    seed (int or None): Seed for the random generator used by this function.
        The default (0) reproduces the same data on every call; pass None
        for non-reproducible data.

    Returns:
    np.ndarray: A 2D array of shape (n_subjects, n_samples_per_subject) where
        each row represents the RT times for a subject
    """
    # A private RandomState seeded with `seed` produces the same numbers as
    # calling np.random.seed(seed) and then the np.random.* functions, but it
    # does not reset the global NumPy generator behind the caller's back.
    rng = np.random.RandomState(seed)
    shift = 0.500  # Constant offset added to every sample
    # Pre-allocate the output; every row is overwritten in the loop below
    RT_data = np.empty((n_subjects, n_samples_per_subject))

    for i in range(n_subjects):
        # Per-subject distribution parameters (the draw order matters for
        # reproducibility: mean first, then standard deviation, then samples)
        mu = rng.uniform(0.090, 0.110)
        sigma = rng.uniform(0.10, 0.20)
        RT_data[i] = rng.normal(mu, sigma, n_samples_per_subject) + shift

    return RT_data


def download_meteo_data(start_date="2022-01-01", end_date="2022-12-31",
                        latitude="45.88204", longitude="11.03647",
                        data="temperature_2m", timeout=30):
    """Download meteo historical data from open-meteo.com.

    Parameters:
    start_date (str): First day of the requested period, e.g. "2022-01-01"
    end_date (str): Last day of the requested period, e.g. "2022-12-31"
    latitude (str): Latitude of the location
    longitude (str): Longitude of the location
    data (str): Name of the hourly variable to request; it is also the key
        read back from the "hourly" section of the response
    timeout (float): Seconds to wait for the server before requests raises
        a timeout error, so a stalled connection cannot hang forever

    Returns:
    tuple: (timestamps, values) as two np.ndarray objects built from the
        "time" and `data` entries of the response's "hourly" section.
        If the response has no "hourly" key, the decoded response is
        printed and (None, None) is returned.
    """
    BASE_URL = "https://archive-api.open-meteo.com/v1/"
    # Let requests build (and URL-encode) the query string instead of
    # concatenating it by hand.
    query_params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": data,
    }

    r = requests.get(BASE_URL + "archive", params=query_params, timeout=timeout)
    json_dict = json.loads(r.text)

    if "hourly" not in json_dict:
        # Show what the server answered instead (presumably an error payload)
        print(json_dict)
        return None, None

    hourly = json_dict["hourly"]
    # Return a real tuple (not a one-shot generator) so the result can be
    # unpacked, indexed, or reused, consistently with the failure branch.
    return np.array(hourly["time"]), np.array(hourly[data])

In [6]:
# Find the index of the subject with the shortest trial reaction time of the whole dataset 
# (not shortest average!)
# (Hint: you will need two operations...)
# (You can read a second hint scrolling right in the cell:                                                                                               : you have to do one min and one argmin...)

rt_data = generate_RT_data(n_subjects=10)
rt_data.shape

(10, 1000)

In [7]:
# Use argmax to find the index of the warmest hour in the (non-reshaped) temperatures_array.
# Then, use that index on timestamps_array to read out the corresponding timestamp.
timestamps_array, temperatures_array = download_meteo_data()


In [9]:
# We can use the np.argsort() function to produce the indexes array required to
# order an array in ascending or descending values.

# Let's make a ranking of the 5 warmest hours of 2022! 
# Sort the (non-reshaped) temperatures array using the indexes produced by np.argsort,
# so that the first elements are the highest temperatures.
# Then sort the timestamps array with the same indexes, and take the first 5.


In [None]:
# Build a boolean selector to filter all temperatures above 10 and below 25 degrees:


In [10]:
# (Bonus):
# Let's do the same, but only for the months between January and March, and only for hours between 08 and 18.

# To get the condition on months and hours, you will have to parse the timestamp strings, or explore
# the datetime library for more elegant solutions!

timestamps_array

array(['2022-01-01T00:00', '2022-01-01T01:00', '2022-01-01T02:00', ...,
       '2022-12-31T21:00', '2022-12-31T22:00', '2022-12-31T23:00'],
      dtype='<U16')

In [None]:
# (Bonus) Take the array of integer numbers below. Use array boolean operations to select the numbers that
# are greater than 5 AND less than 8, OR that are multiples of 7.
np.random.seed(42)
an_array = np.random.randint(0, 10, size=100)

# Build each condition as its own boolean mask, then combine them element-wise
strictly_between_5_and_8 = (an_array > 5) & (an_array < 8)
is_multiple_of_7 = (an_array % 7) == 0
an_array[strictly_between_5_and_8 | is_multiple_of_7]

## Introduction to `pandas`

#### 1.2.1 DataFrames

In [11]:
import pandas as pd

In [24]:
# Consider the following dataset with info about the subjects in an experiment:
np.random.seed(42)
n_subjects = 100

# Columns are drawn in this exact order so the seeded random stream
# (and therefore the resulting table) stays reproducible.
subjects_df = pd.DataFrame(
    dict(
        age=np.random.randint(20, 40, size=n_subjects),
        weight=np.random.randint(50, 100, size=n_subjects),
        height=np.random.randint(150, 200, size=n_subjects),
        sex=np.random.choice(['M', 'F'], size=n_subjects),
        handedness=np.random.choice(['R', 'L'], size=n_subjects),
        group=np.random.choice(['control', 'patient'], size=n_subjects),
    )
)

subjects_df.head()

Unnamed: 0,age,weight,height,sex,handedness,group
0,26,84,175,M,L,patient
1,39,93,181,M,R,patient
2,34,89,155,F,L,patient
3,30,71,181,M,R,patient
4,27,76,153,F,L,control


In [25]:
# Select the first two rows of the dataframe:


In [None]:
# Select the column of the dataframe containing the subjects' weight using the name of the column:


In [None]:
# Select the `weight` column of the dataframe filtering only rows of subjects > 34 years old:


In [None]:
# Count how many males and how many females above age 30 are left-handed or right-handed.
# (Hint: use the same element-wise operators we were using for numpy arrays)


In [None]:
# Use .iloc to select every other row (one row out of every two) for the first 3 columns:


In [None]:
# Redefine the dataset index to be "subject_n_sex"
# (where n is the progressive number of the subject and sex is the sex of the subject)



#### 1.2.2

In [None]:
# We can calculate the BMI as:
# (body weight in kg) / ((height in meters) ** 2)

# Define a new column where you compute the BMI for every subject:



In [None]:
string_data = ['cabbage', 'artichoke', 'banana', 'avocado', 'apple', 'orange']
int_data = [1, 2, 3, 4, 5, 6]
float_data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

# organize the data above in a dictionary and from it define a new DataFrame:


In [26]:
# Now, organize the data in a list of dictionaries and from it define a new DataFrame:


In [None]:
# Extend the subject dataset above adding two subjects for which you have the following info:
# (Hint: start by defining a new dataframe to concatenate, and then concatenate)

# One record per new subject, with the same fields as the columns of the subjects dataset
new_subject1 = dict(
    age=25,
    weight=78,
    height=170,
    sex="M",
    handedness="R",
    group="patient",
)
new_subject2 = dict(
    age=35,
    weight=65,
    height=165,
    sex="F",
    handedness="R",
    group="control",
)

# Make sure the new dataset has a unique index (no duplicated index labels)!