In [46]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [39]:
DATA_PATH = './data'

train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# train_origin = train.copy()
# test_origin = test.copy()

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [40]:
test['emission'] = None
test = test[['ID_LAT_LON_YEAR_WEEK', 'emission']]

In [41]:
test

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,
1,ID_-0.510_29.290_2022_01,
2,ID_-0.510_29.290_2022_02,
3,ID_-0.510_29.290_2022_03,
4,ID_-0.510_29.290_2022_04,
...,...,...
24348,ID_-3.299_30.301_2022_44,
24349,ID_-3.299_30.301_2022_45,
24350,ID_-3.299_30.301_2022_46,
24351,ID_-3.299_30.301_2022_47,


In [42]:
test_ids = test['ID_LAT_LON_YEAR_WEEK'].values
test_ids

array(['ID_-0.510_29.290_2022_00', 'ID_-0.510_29.290_2022_01',
       'ID_-0.510_29.290_2022_02', ..., 'ID_-3.299_30.301_2022_46',
       'ID_-3.299_30.301_2022_47', 'ID_-3.299_30.301_2022_48'],
      dtype=object)

In [47]:
for i in test_ids:
    # ID_LAT_LON
    id_lat_lon_idx = i.index('2022')
    id_lat_lon = i[:id_lat_lon_idx - 1]

    # week
    week = i[-2:]

    # train에서 같은 ID 찾기
    df = train.loc[train['ID_LAT_LON_YEAR_WEEK'].str.startswith(id_lat_lon)]
    df = df.loc[df['ID_LAT_LON_YEAR_WEEK'].str.endswith(week)]

    # 상관계수 구하기
    corr = df[['year', 'emission']].corr()
    if abs(corr['emission']['year']) >= 0.8:
        X = np.expand_dims(df.loc[:, 'year'].values, axis=1)
        y = np.expand_dims(df.loc[:, 'emission'].values, axis=1)
        
        lr = LinearRegression(fit_intercept=True)

        lr.fit(X, y)
        value = lr.predict([[2022]])
        
    else:
        value = None
    
    test.loc[test['ID_LAT_LON_YEAR_WEEK'] == i, 'emission'] = value

In [49]:
# test.to_csv('test_origin.csv')

In [50]:
test.isna().sum()

ID_LAT_LON_YEAR_WEEK        0
emission                19188
dtype: int64

In [63]:
test_new = test.copy()

In [64]:
test_new

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,3.176613
1,ID_-0.510_29.290_2022_01,
2,ID_-0.510_29.290_2022_02,3.938143
3,ID_-0.510_29.290_2022_03,3.962625
4,ID_-0.510_29.290_2022_04,4.062005
...,...,...
24348,ID_-3.299_30.301_2022_44,
24349,ID_-3.299_30.301_2022_45,
24350,ID_-3.299_30.301_2022_46,
24351,ID_-3.299_30.301_2022_47,


In [69]:
test_new_na = test_new.loc[test_new['emission'].isna()]
test_new_na

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
1,ID_-0.510_29.290_2022_01,
5,ID_-0.510_29.290_2022_05,
8,ID_-0.510_29.290_2022_08,
10,ID_-0.510_29.290_2022_10,
11,ID_-0.510_29.290_2022_11,
...,...,...
24348,ID_-3.299_30.301_2022_44,
24349,ID_-3.299_30.301_2022_45,
24350,ID_-3.299_30.301_2022_46,
24351,ID_-3.299_30.301_2022_47,


In [70]:
test_new_na_ids = test_new_na['ID_LAT_LON_YEAR_WEEK'].values
test_new_na_ids

array(['ID_-0.510_29.290_2022_01', 'ID_-0.510_29.290_2022_05',
       'ID_-0.510_29.290_2022_08', ..., 'ID_-3.299_30.301_2022_46',
       'ID_-3.299_30.301_2022_47', 'ID_-3.299_30.301_2022_48'],
      dtype=object)

In [74]:
for i in test_new_na_ids:
    # ID_LAT_LON
    id_lat_lon_idx = i.index('2022')
    id_lat_lon = i[:id_lat_lon_idx - 1]

    # week
    week = i[-2:]

    # train에서 같은 ID 찾기
    df = train.loc[train['ID_LAT_LON_YEAR_WEEK'].str.startswith(id_lat_lon)]
    df = df.loc[df['ID_LAT_LON_YEAR_WEEK'].str.endswith(week)]

    # emission 최대 최소 차이가 5 미만일 경우
    em_max = df['emission'].max()
    em_min = df['emission'].min()
    if em_max - em_min < 5:
        em_2019 = df.loc[df['year'] == 2019, 'emission'].values[0]
        em_2020 = df.loc[df['year'] == 2020, 'emission'].values[0]
        em_2021 = df.loc[df['year'] == 2021, 'emission'].values[0]
        value = em_2021 * 0.5 + em_2020 * 0.2 + em_2019 * 0.3

        test_new_na.loc[test_new_na['ID_LAT_LON_YEAR_WEEK'] == i, 'emission'] = value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_new_na.loc[test_new_na['ID_LAT_LON_YEAR_WEEK'] == i, 'emission'] = value


In [86]:
test_new_na.isna().sum()

ID_LAT_LON_YEAR_WEEK        0
emission                10471
dtype: int64

In [87]:
test_new_drop = test_new.dropna()

In [88]:
test_new2 = pd.concat([test_new_drop, test_new_na])
test_new2.sort_index(inplace=True)
test_new2

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,3.176613
1,ID_-0.510_29.290_2022_01,4.003853
2,ID_-0.510_29.290_2022_02,3.938143
3,ID_-0.510_29.290_2022_03,3.962625
4,ID_-0.510_29.290_2022_04,4.062005
...,...,...
24348,ID_-3.299_30.301_2022_44,29.071867
24349,ID_-3.299_30.301_2022_45,29.417032
24350,ID_-3.299_30.301_2022_46,29.578025
24351,ID_-3.299_30.301_2022_47,30.109667


In [89]:
# test_new2.to_csv('test_origin2.csv')

In [90]:
test_new2.isna().sum()

ID_LAT_LON_YEAR_WEEK        0
emission                10471
dtype: int64

In [95]:
subset_ids = list(test_new2['ID_LAT_LON_YEAR_WEEK'])
subset_ids[:10]

['ID_-0.510_29.290_2022_00',
 'ID_-0.510_29.290_2022_01',
 'ID_-0.510_29.290_2022_02',
 'ID_-0.510_29.290_2022_03',
 'ID_-0.510_29.290_2022_04',
 'ID_-0.510_29.290_2022_05',
 'ID_-0.510_29.290_2022_06',
 'ID_-0.510_29.290_2022_07',
 'ID_-0.510_29.290_2022_08',
 'ID_-0.510_29.290_2022_09']

In [111]:
train_subset = pd.DataFrame(columns=train.columns)

In [112]:
train_subset

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission


In [105]:
temp = train.iloc[[1]]

In [113]:
temp

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176


In [108]:
train_subset

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission


In [110]:
train_subset.iloc[len(train_subset)] = temp

ValueError: cannot set a row with mismatched columns

In [33]:
ANNUAL_INCREASEMENT_RATIO = 1.09738621

In [36]:
test['emission'] = test['emission'] / ANNUAL_INCREASEMENT_RATIO
test

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,3.535276
1,ID_-0.510_29.290_2022_01,3.998495
2,ID_-0.510_29.290_2022_02,4.131075
3,ID_-0.510_29.290_2022_03,4.194229
4,ID_-0.510_29.290_2022_04,4.254535
...,...,...
24348,ID_-3.299_30.301_2022_44,29.836845
24349,ID_-3.299_30.301_2022_45,30.249885
24350,ID_-3.299_30.301_2022_46,30.432126
24351,ID_-3.299_30.301_2022_47,30.86862


In [35]:
test.to_csv('test6_3.csv')