In [1]:
import os
import pandas as pd

In [2]:
folder_path = r"C:\Users\dqthi\Downloads\pandas-workout-data\data\ex31"
files = os.listdir(folder_path)

# Initialize list to store all dataframes
all_dfs = []

# Process each file
for one_file in files:
    if one_file.endswith('.csv'):
        # Create full file path
        full_file_path = os.path.join(folder_path, one_file)
        
        # Get city and state from filename
        name_part = one_file.removesuffix('.csv')
        if ',' in name_part:
            city, state = name_part.split(',')
            
            # Read the CSV file
            one_df = (
                pd.read_csv(
                    full_file_path,
                    usecols=[0, 1, 2],
                    names=[ 'max_temp', 'min_temp', 'precipMM'],
                    header=0
                )
                .assign(
                    city=city.replace('+', ' ').title(),
                    state=state.upper()
                )
            )
            
            all_dfs.append(one_df)
            print(f"Processed {city}, {state}")
        else:
            print(f"Warning: Skipping file with invalid name format: {one_file}")

# Combine all dataframes
if all_dfs:
    final_df = pd.concat(all_dfs, ignore_index=True)
    print("\nFinal dataframe shape:", final_df.shape)
    print("\nFirst few rows:")
    print(final_df.head())
else:
    print("No valid files were processed")

Processed albany, ny
Processed boston, ma
Processed chicago, il
Processed los+angeles, ca
Processed new+york, ny
Processed san+francisco, ca
Processed springfield, il
Processed springfield, ma

Final dataframe shape: (5824, 5)

First few rows:
              max_temp  min_temp  precipMM    city state
0  2018-12-11 00:00:00        -2        -8  Albany    NY
1  2018-12-11 03:00:00        -2        -8  Albany    NY
2  2018-12-11 06:00:00        -2        -8  Albany    NY
3  2018-12-11 09:00:00        -2        -8  Albany    NY
4  2018-12-11 12:00:00        -2        -8  Albany    NY


In [3]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   max_temp  5824 non-null   object
 1   min_temp  5824 non-null   int64 
 2   precipMM  5824 non-null   int64 
 3   city      5824 non-null   object
 4   state     5824 non-null   object
dtypes: int64(2), object(3)
memory usage: 227.6+ KB


In [4]:
def has_multiple_readings_at_least(mini_df):
    return mini_df.loc[
        mini_df['precipMM'] >= 15,
        'precipMM'
        ].count() >= 3

In [5]:
(
    final_df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least)
)

Unnamed: 0,max_temp,min_temp,precipMM,city,state
2184,2018-12-11 00:00:00,20,14,Los Angeles,CA
2185,2018-12-11 03:00:00,20,14,Los Angeles,CA
2186,2018-12-11 06:00:00,20,14,Los Angeles,CA
2187,2018-12-11 09:00:00,20,14,Los Angeles,CA
2188,2018-12-11 12:00:00,20,14,Los Angeles,CA
...,...,...,...,...,...
2907,2019-03-11 09:00:00,19,12,Los Angeles,CA
2908,2019-03-11 12:00:00,19,12,Los Angeles,CA
2909,2019-03-11 15:00:00,19,12,Los Angeles,CA
2910,2019-03-11 18:00:00,19,12,Los Angeles,CA


In [6]:
(
    final_df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least)
    [['city', 'state']]
    .drop_duplicates()
)

Unnamed: 0,city,state
2184,Los Angeles,CA


In [7]:
def has_multiple_readings_at_least(mini_df, min_mm, times):
    return mini_df.loc[
        mini_df['precipMM'] >= min_mm,
        'precipMM'
        ].count() >= times

In [8]:
(
    final_df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least,
            min_mm=10,
            times=3)
    [['city', 'state', 'precipMM']]
    .drop_duplicates()
)

Unnamed: 0,city,state,precipMM
2184,Los Angeles,CA,14
2200,Los Angeles,CA,15
2208,Los Angeles,CA,16
2256,Los Angeles,CA,17
2264,Los Angeles,CA,12
...,...,...,...
4768,Springfield,IL,-25
4776,Springfield,IL,-22
4832,Springfield,IL,10
4848,Springfield,IL,-15


In [9]:
def has_multiple_readings_at_least(mini_df, min_mm, times):
    return mini_df.loc[
        ((mini_df['precipMM'] >= min_mm) &
         (mini_df['min_temp'] <= 0)),
        'precipMM'
        ].count() >= times

In [11]:
(
    final_df
    .groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least, min_mm=10, times=3)
    [['city', 'state']]
    .drop_duplicates()
)

Unnamed: 0,city,state


# Redo this exercise to make sure I comprehense the concept 