In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv('seattle-weather.csv')

# Display the first few rows of the dataset
df.head()


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [5]:
# Check for missing values
print(df.isnull().sum())


date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64


In [6]:
# Handle missing values using forward fill
df.ffill(inplace=True)

# Verify that there are no more missing values
print(df.isnull().sum())


date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64


In [8]:
# Select only numeric columns for IQR calculation
numeric_df = df.select_dtypes(include=[float, int])

# Calculate the IQR for numeric columns
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Filter out the outliers
numeric_df_no_outliers = numeric_df[~((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Combine the non-numeric columns back with the filtered numeric data
non_numeric_df = df.select_dtypes(exclude=[float, int])
df_filtered = pd.concat([numeric_df_no_outliers, non_numeric_df], axis=1).reindex(df.index)


In [9]:
# Example: Correcting any negative precipitation values (if any)
df_filtered['precipitation'] = df_filtered['precipitation'].apply(lambda x: max(x, 0))


In [10]:
import pandas as pd

# Load the dataset
df = pd.read_csv('seattle-weather.csv')

# Display the first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Handle missing values using forward fill
df.ffill(inplace=True)

# Verify that there are no more missing values
print(df.isnull().sum())

# Select only numeric columns for IQR calculation
numeric_df = df.select_dtypes(include=[float, int])

# Calculate the IQR for numeric columns
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Filter out the outliers
numeric_df_no_outliers = numeric_df[~((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Combine the non-numeric columns back with the filtered numeric data
non_numeric_df = df.select_dtypes(exclude=[float, int])
df_filtered = pd.concat([numeric_df_no_outliers, non_numeric_df], axis=1).reindex(df.index)

# Check Data Consistency
# Example: Correcting any negative precipitation values (if any)
df_filtered['precipitation'] = df_filtered['precipitation'].apply(lambda x: max(x, 0))

# Export the cleaned data to a CSV file
df_filtered.to_csv('cleaned_seattle_weather.csv', index=False)

# Display the cleaned dataset
print(df_filtered.head())


         date  precipitation  temp_max  temp_min  wind  weather
0  2012-01-01            0.0      12.8       5.0   4.7  drizzle
1  2012-01-02           10.9      10.6       2.8   4.5     rain
2  2012-01-03            0.8      11.7       7.2   2.3     rain
3  2012-01-04           20.3      12.2       5.6   4.7     rain
4  2012-01-05            1.3       8.9       2.8   6.1     rain
date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64
date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64
   precipitation  temp_max  temp_min  wind        date  weather
0            0.0      12.8       5.0   4.7  2012-01-01  drizzle
1            NaN       NaN       NaN   NaN  2012-01-02     rain
2            0.8      11.7       7.2   2.3  2012-01-03     rain
3            NaN       NaN       NaN   NaN  2012-01-04     rain
4            1.3       8.9       2.8   6.1