# Practicals for lecture 1.2

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vigji/python-cimec-2025/blob/main/practicals/Practicals_1.2.ipynb)

#### 1.2.0 

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [7]:
def generate_RT_data(n_subjects=200, n_samples_per_subject=1000, seed=0):
    """Generate simulated Reaction Time (RT) data for a number of subjects.

    For every subject, a mean and a standard deviation are drawn uniformly
    at random; the subject's RTs are then sampled from the corresponding
    normal distribution and shifted by a constant offset of 0.5.

    Parameters
    ----------
    n_subjects : int
        Number of subjects (rows of the returned matrix).
    n_samples_per_subject : int
        Number of RT samples per subject (columns of the returned matrix).
    seed : int
        Value passed to ``np.random.seed`` before sampling. The default (0)
        reproduces the dataset used throughout these practicals; pass another
        value to obtain a different dataset.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_subjects, n_samples_per_subject)``.

    Notes
    -----
    This function reseeds numpy's *global* random state as a side effect.
    """
    np.random.seed(seed)  # For reproducibility
    shift = 0.500  # Constant offset added to every sample
    # Initialize an empty array to store the RTs for all subjects
    RT_data = np.empty((n_subjects, n_samples_per_subject))

    for i in range(n_subjects):
        # Per-subject parameters: mu is drawn from [0.090, 0.110) and
        # sigma from [0.10, 0.20), i.e. the same units as `shift`.
        # The draw order (mu, sigma, samples) must not change, otherwise the
        # generated dataset would differ for the same seed.
        mu = np.random.uniform(0.090, 0.110)
        sigma = np.random.uniform(0.10, 0.20)
        RT_data[i] = np.random.normal(mu, sigma, n_samples_per_subject) + shift

    return RT_data

# This is our data matrix (200 subjects, 1000 RTs each)
rt_data = generate_RT_data()

# Assume that we consider as outliers the RTs longer than 0.7 seconds.
# Compute again the mean RTs after having excluded such outliers (make sure you use only vector operations)

# (Hint: an easy way to exclude outliers without using loops is by setting nans in the matrix)
is_outlier = rt_data > 0.7  # boolean mask with the same shape as rt_data
outliers_indexes_tuple = np.nonzero(is_outlier)  # (row indexes, column indexes)
print(outliers_indexes_tuple)
rt_data[is_outlier] = np.nan  # overwrite the outliers in place
mean_no_outliers = np.nanmean(rt_data)  # mean over all entries, skipping the nans
mean_no_outliers


(array([  0,   0,   0, ..., 199, 199, 199]), array([  0,   1,   2, ..., 979, 981, 984]))


0.5374604871607751

In [11]:
# Find the shortest RT of the whole dataset; then, find its index to know the subject and the trial number:
min_val = np.nanmin(rt_data)  # nan-aware minimum: nan entries are skipped
is_min = rt_data == min_val
# One (subject, trial) row for every position holding the minimum value
np.transpose(np.nonzero(is_min))


array([[ 28, 610]])

In [13]:
# Alternative: nanargmin returns the position of the minimum in the flattened
# array; unravel_index converts it back into (row, column) coordinates.
min_idx = np.nanargmin(rt_data)
np.unravel_index(min_idx, shape=rt_data.shape)

(28, 610)

In [16]:
def download_meteo_data(start_date="2022-01-01", end_date="2022-12-31",
                        latitude="45.88204", longitude="11.03647",
                        data="temperature_2m"):
    """Download hourly historical meteo data from open-meteo.com.

    Parameters
    ----------
    start_date, end_date : str
        First and last day of the requested period, sent as-is to the API
        (the defaults use the ``YYYY-MM-DD`` format).
    latitude, longitude : str
        Coordinates of the location, sent as-is to the API.
    data : str
        Name of the hourly variable to request; it is also the key used to
        read the values out of the response.

    Returns
    -------
    tuple
        ``(timestamps, values)``: two arrays reshaped to 24 columns, i.e. one
        row per block of 24 hourly entries. If the response has no
        ``"hourly"`` entry (e.g. the API reported an error), the response is
        printed and ``(None, None)`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, or if the server does not answer within the
        timeout.
    """

    import requests

    ARCHIVE_URL = "https://archive-api.open-meteo.com/v1/archive"
    # Let requests build (and properly encode) the query string
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": data,
    }

    # Without a timeout the call could block forever on an unresponsive server
    r = requests.get(ARCHIVE_URL, params=params, timeout=30)
    json_dict = r.json()

    if "hourly" not in json_dict:
        # Show whatever the server sent back so the problem can be diagnosed
        print(json_dict)
        return None, None

    hourly = json_dict["hourly"]
    return (np.array(hourly["time"]).reshape(-1, 24),
            np.array(hourly[data]).reshape(-1, 24))

# Find the index for the highest temperature in the temperature_array.
# Then, apply the index you have found over timestamps_array to read out the corresponding timestamp(s).

# (Hint:
# Remember, you have to either:
#  - use np.argmax, and then apply the index you found to the flattened arrays (.flatten());
#  - find the maximum value using np.max, and then use np.nonzero to get the indexes of the values
#    equal to the max over both dimensions of the array)

timestamps_array, temperatures_array = download_meteo_data()

# With no axis given, argmax returns an index into the flattened array,
# so the timestamps have to be flattened before applying it.
max_t_idx = np.argmax(temperatures_array)
flat_timestamps = timestamps_array.ravel()
flat_timestamps[max_t_idx]

'2022-07-22T13:00'

In [23]:
# The following code gives you a matrix of zeros and ones:
import numpy as np
np.random.seed(42)
n_rows, n_cols = 5, 6
data = np.random.randint(low=0, high=2, size=(n_rows, n_cols))

# Find the rows and columns of all the ones in the matrix:
# (nonzero returns a tuple with one index array per dimension: rows, then columns)
ones_idxs = data.nonzero()

In [27]:
# Then, replace them with nans using the indexes you found;
# can you do it without changing the dtype of the array?

# nan is a floating point value, so the replacement is done on a float copy of the data
data_float = np.array(data, dtype=float)
rows, cols = ones_idxs
data_float[rows, cols] = np.nan
data_float

array([[ 0., nan,  0.,  0.,  0., nan],
       [ 0.,  0.,  0., nan,  0.,  0.],
       [ 0.,  0., nan,  0., nan, nan],
       [nan,  0., nan,  0., nan, nan],
       [nan, nan, nan, nan, nan, nan]])

In [7]:
# This array represents a toy example signal with some noise.
# Without using loops, find the indexes of the peaks in the signal.
# Peaks are defined as elements of the array that are larger than the elements immediately before and after them.

signal = np.array([1, 3, 7, 1, 2, 6, 0, 1])

# Three aligned views of the signal: element i of `middle` has `left[i]` as
# its predecessor and `right[i]` as its successor in the original array.
left, middle, right = signal[:-2], signal[1:-1], signal[2:]

# True wherever an interior element is larger than both of its neighbors
is_peak = np.logical_and(middle > left, middle > right)

# `middle` starts at signal[1], so shift the positions by one to index `signal`
peak_indices = np.flatnonzero(is_peak) + 1

print("Peak indices:", peak_indices)
print("Peak values:", signal[peak_indices])

Peak indices: [2 5]
Peak values: [7 6]


#### Practicals 1.2.1

In [35]:
# Now compute the 1D array of average temperatures per day.
# Build a boolean selector to filter all mean temperatures above 10 and below 25 degrees:
daily_means = np.mean(temperatures_array, axis=1)  # average over the 24 columns of each row
in_range = np.logical_and(daily_means > 10, daily_means < 25)  # the boolean selector
vals = daily_means[in_range]  # the selected mean temperatures
idxs = np.argwhere(in_range)  # the positions of the selected rows


In [38]:
# For the array of integer numbers below, use array boolean operations to select the numbers that
# are greater than 5 AND less than 8, OR that are multiples of 7.
# Use the selector to create a new array with the included numbers.
np.random.seed(42)
an_array = np.random.randint(0, 10, 100)

between_5_and_8 = np.logical_and(an_array > 5, an_array < 8)
multiple_of_7 = np.equal(np.mod(an_array, 7), 0)  # note: 0 also satisfies this test
selector = np.logical_or(between_5_and_8, multiple_of_7)

an_array[selector]


array([6, 7, 6, 6, 7, 7, 7, 7, 0, 0, 6, 6, 6, 6, 7, 0, 7, 7, 6, 7, 7, 0,
       6, 7, 0, 7, 7, 0, 7, 0, 6, 6, 7])

In [52]:
# We can use the np.argsort() function to produce the indexes array required to
# order an array in ascending or descending values.

# For example:
random_arr = np.array([0.1, 5, 3.4, 2.3])
ordering_idxs = random_arr.argsort()
random_arr[ordering_idxs]  # with this index, this is now ordered!

# Let's make a ranking of the 5 warmest hours of the dataset!
# Sort the (flattened) temperature array using the indexes produced by np.argsort,
# so that the first elements are the highest temperatures.
# Then sort the timestamps array with the same indexes, and take the first 5.
flat_temperatures = temperatures_array.flatten()
flat_timestamps = timestamps_array.flatten()
# argsort orders in ascending values: negating the temperatures puts the highest ones first
descending_ordered_idxs = np.argsort(-flat_temperatures)
# Double check you match the result that you have got in the exercises above!

flat_timestamps[descending_ordered_idxs][:5]

array(['2022-07-22T13:00', '2022-07-22T12:00', '2022-07-26T12:00',
       '2022-07-25T13:00', '2022-07-22T11:00'], dtype='<U16')

#### 1.2.2 Introduction to `pandas` DataFrames

In [45]:
import pandas as pd

In [9]:
# Consider the following dataset with info about subjects in an experiment:
np.random.seed(42)
n_subjects = 100

# NOTE: the columns are generated in exactly this order (integer columns first,
# then categorical ones), so that the sequence of random draws, and therefore
# the data, is reproducible with the seed set above.
integer_ranges = {'age': (20, 40), 'weight': (50, 100), 'height': (150, 200)}
category_levels = {'sex': ['M', 'F'], 'handedness': ['R', 'L'], 'group': ['control', 'patient']}

columns = {}
for name, (low, high) in integer_ranges.items():
    columns[name] = np.random.randint(low, high, n_subjects)
for name, levels in category_levels.items():
    columns[name] = np.random.choice(levels, n_subjects)

subjects_df = pd.DataFrame(columns)

subjects_df.head()

Unnamed: 0,age,weight,height,sex,handedness,group
0,26,84,175,M,L,patient
1,39,93,181,M,R,patient
2,34,89,155,F,L,patient
3,30,71,181,M,R,patient
4,27,76,153,F,L,control


In [10]:
# Select the first two rows of the dataframe:
# (.loc slices by label and includes the end label, so rows 0 and 1 are both selected)
subjects_df.loc[0:1]


Unnamed: 0,age,weight,height,sex,handedness,group
0,26,84,175,M,L,patient
1,39,93,181,M,R,patient


In [11]:
# Select the column of the dataframe containing the subjects weight using the name of the column:
# (indexing the dataframe with a column name returns that column as a Series)
subjects_df["weight"]


0     84
1     93
2     89
3     71
4     76
      ..
95    82
96    50
97    68
98    51
99    93
Name: weight, Length: 100, dtype: int64

In [12]:
# Select the `weight` column of the dataframe filtering only rows of subjects > 34 years old:
is_over_34 = subjects_df["age"] > 34  # boolean Series, one entry per subject
subjects_df.loc[is_over_34, "weight"]


1     93
6     50
19    64
21    75
24    81
26    98
29    79
33    94
36    78
39    81
47    77
49    93
56    88
62    52
67    58
69    82
82    51
83    52
98    51
Name: weight, dtype: int64

In [13]:
# Count how many males and how many females above age 30 are left-handed or right handed.
# (Hint: use the same element-wise operators we were using for numpy arrays)

is_above_30 = subjects_df['age'] > 30  # does not depend on the loop variables
for sex in ["M", "F"]:
    is_sex = subjects_df['sex'] == sex
    for handedness in ["R", "L"]:
        is_handed = subjects_df['handedness'] == handedness
        # Rows satisfying all three conditions at once
        n_matching = len(subjects_df[is_above_30 & is_sex & is_handed])
        print(f"Number of {sex} {handedness} handed subjects above 30: {n_matching}")


Number of M R handed subjects above 30: 11
Number of M L handed subjects above 30: 13
Number of F R handed subjects above 30: 7
Number of F L handed subjects above 30: 11


In [14]:
# Use .iloc to select one every two rows for the first 3 columns:
every_other_row = slice(None, None, 2)  # same as writing ::2
first_three_columns = slice(None, 3)  # same as writing :3
subjects_df.iloc[every_other_row, first_three_columns]


Unnamed: 0,age,weight,height
0,26,84,175
2,34,89,155
4,27,76,153
6,38,50,166
8,30,86,173
10,27,63,183
12,21,50,171
14,25,75,197
16,20,88,182
18,31,58,155
