In [34]:
import numpy as np
import pandas as pd
import math
import sys

from IJCAI2017_TOOL import *

In [18]:
# Load the raw weather observations. The file is GBK-encoded;
# low_memory=False makes pandas parse it in a single pass rather than in
# chunks, which avoids mixed-type column inference.
WEATHER_raw = pd.read_csv(
    './WEATHER_raw.csv',
    encoding='gbk',
    low_memory=False,
)

In [19]:
# Preview the first rows of the raw weather table as loaded from the CSV.
WEATHER_raw.head() 

Unnamed: 0,Port,DATE,Time,Temp,Bodytemp,Dew,Humidity,Pressure,Visibility,Wind_dir,Wind_speed,Gust_speed,Event,Condition,CITY_EN
0,ZSSS,2015-05-01,12:00 AM,16.0,-,14.0,88%,1014,8.0,ESE,10.8,-,,Clear,shanghai
1,ZSSS,2015-05-01,12:30 AM,16.0,-,14.0,88%,1013,8.0,SE,14.4,-,,Unknown,shanghai
2,ZSSS,2015-05-01,1:00 AM,16.0,-,14.0,88%,1013,8.0,SSE,10.8,-,,Unknown,shanghai
3,ZSSS,2015-05-01,1:30 AM,16.0,-,14.0,88%,1013,7.0,SSE,14.4,-,,Scattered Clouds,shanghai
4,ZSSS,2015-05-01,2:00 AM,16.0,-,14.0,88%,1013,7.0,SE,14.4,-,,Mostly Cloudy,shanghai


In [20]:
def SSD(Temp,Velo,Humi):
    """Compute the human comfort index (SSD).

    Formula:
        ssd = (1.818*t + 18.18) * (0.88 + 0.002*f) + (t - 32)/(45 - t) - 3.2*v + 18.2

    where t is the mean air temperature, f the relative humidity and v the
    wind speed.

    Parameters
    ----------
    Temp : number or array-like
        Air temperature t.
    Velo : number or array-like
        Wind speed v.
    Humi : number or array-like
        Relative humidity f (the formula scales it by 0.002, so presumably a
        percentage value such as 88 -- confirm against the data source).

    Returns
    -------
    Same kind as the inputs: the comfort score. Arithmetic is element-wise,
    so pandas Series inputs yield a Series.
    """
    thermal_term = (1.818 * Temp + 18.18) * (0.88 + 0.002 * Humi)
    # NOTE(review): this term is undefined at Temp == 45 (division by zero).
    correction_term = 1.0 * (Temp - 32) / (45 - Temp)
    wind_term = 3.2 * Velo
    return thermal_term + correction_term - wind_term + 18.2

def AMPM2decimal(ser):
    """Convert a 12-hour clock string into decimal hours on a 24-hour scale.

    Uses the built-in int/float instead of the np.int/np.float aliases,
    which were removed from NumPy and raise AttributeError on current
    releases.

    Parameters
    ----------
    ser : str
        Time formatted as 'H:MM AM' or 'H:MM PM', e.g. '12:30 AM'.

    Returns
    -------
    float or None
        Hours since midnight ('12:30 AM' -> 0.5, '1:00 PM' -> 13.0).
        None when the trailing marker is neither 'AM' nor 'PM'.
    """
    # '1:30 PM' -> ['1', '30', 'PM']
    parts = ser.replace(' ', ':').split(':')
    # On a 12-hour clock, 12 is the zeroth hour of its half-day.
    hour = int(parts[0]) % 12
    minute_fraction = float(parts[1]) / 60.
    meridiem = parts[2]
    if meridiem == 'AM':
        return float(hour) + minute_fraction
    if meridiem == 'PM':
        return float(hour) + minute_fraction + 12.
    # Unrecognised marker: keep the original "no value" result, but explicitly.
    return None
        
def Eventclean(ser):
    """Normalise one raw `Event` cell into a single-line string.

    Replaces the earlier bare `except:` control flow with explicit type
    checks, so unrelated errors are no longer swallowed and every input
    yields a string.

    Parameters
    ----------
    ser : str, float or None
        Raw cell value. Missing cells arrive as float NaN.

    Returns
    -------
    str
        'None' for a missing value (NaN or None); otherwise the text with
        every newline, tab and carriage-return character removed. Other
        non-string values are converted with str() before cleaning.
    """
    if ser is None:
        return 'None'
    if not isinstance(ser, str):
        try:
            if math.isnan(ser):
                return 'None'
        except TypeError:
            # Not a real number; fall through and treat it as text.
            pass
        ser = str(ser)
    return ser.replace('\n', '').replace('\t', '').replace('\r', '')

In [21]:
# Keep only the columns the feature pipeline works with. The explicit .copy()
# makes the result an independent frame, so the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
WEATHER_raw = WEATHER_raw[['DATE','Time','Temp','Visibility','Wind_speed','Humidity','Event','Condition','CITY_EN']].copy()

In [22]:
# Preview the table after narrowing it to the selected columns.
WEATHER_raw.head()

Unnamed: 0,DATE,Time,Temp,Visibility,Wind_speed,Humidity,Event,Condition,CITY_EN
0,2015-05-01,12:00 AM,16.0,8.0,10.8,88%,,Clear,shanghai
1,2015-05-01,12:30 AM,16.0,8.0,14.4,88%,,Unknown,shanghai
2,2015-05-01,1:00 AM,16.0,8.0,10.8,88%,,Unknown,shanghai
3,2015-05-01,1:30 AM,16.0,7.0,14.4,88%,,Scattered Clouds,shanghai
4,2015-05-01,2:00 AM,16.0,7.0,14.4,88%,,Mostly Cloudy,shanghai


In [24]:
# Normalise the time format: turn 12-hour 'H:MM AM/PM' strings into decimal hours.
WEATHER_raw['Time'] = [AMPM2decimal(clock) for clock in WEATHER_raw['Time']]

In [28]:
# Check that the Time column now holds decimal hours.
WEATHER_raw.head()

Unnamed: 0,DATE,Time,Temp,Visibility,Wind_speed,Humidity,Event,Condition,CITY_EN
0,2015-05-01,0.0,16.0,8.0,10.8,88%,,Clear,shanghai
1,2015-05-01,0.5,16.0,8.0,14.4,88%,,Unknown,shanghai
2,2015-05-01,1.0,16.0,8.0,10.8,88%,,Unknown,shanghai
3,2015-05-01,1.5,16.0,7.0,14.4,88%,,Scattered Clouds,shanghai
4,2015-05-01,2.0,16.0,7.0,14.4,88%,,Mostly Cloudy,shanghai


In [36]:
# Normalise the event format: run every Event cell through Eventclean.
WEATHER_raw['Event'] = [Eventclean(event) for event in WEATHER_raw['Event']]

In [38]:
# Clean visibility and fill the gaps.
# Convert to numbers first: errors='coerce' turns '-' (the missing-value
# marker) and any other unparseable token into NaN. The deprecated
# errors='ignore' silently left the whole column as strings when a single
# value failed to parse.
# Then forward-fill each gap from the previous observation. Series.ffill()
# replaces the deprecated fillna(method='ffill').
WEATHER_raw['Visibility'] = pd.to_numeric(
    WEATHER_raw['Visibility'],
    errors='coerce',
).ffill()

In [39]:
# Temperature: '-' marks a missing reading and is replaced by the placeholder 0.0.
# errors='coerce' replaces the deprecated errors='ignore', which silently left
# the column as strings when any value failed to parse; such values now become NaN.
# NOTE(review): the 0.0 placeholder feeds straight into the SSD formula --
# confirm that this is preferable to a forward fill.
WEATHER_raw['Temp'] = pd.to_numeric(
    WEATHER_raw['Temp'].replace('-', 0.0),
    errors='coerce',
)

In [40]:
# Make wind speed numeric.
# 'Calm' means no wind (0.0); '-' (missing) falls back to 3.6, which becomes
# 1.0 after the division below.
# errors='coerce' replaces the deprecated errors='ignore', which silently left
# the column as strings (and broke the division) when any value failed to parse.
# The division by 3.6 is presumably a km/h -> m/s conversion -- confirm the
# source units.
WEATHER_raw['Wind_speed'] = pd.to_numeric(
    WEATHER_raw['Wind_speed'].replace({'Calm': 0.0, '-': 3.6}),
    errors='coerce',
) / 3.6

In [41]:
# Humidity: strip the percent sign and convert to an integer.
# 'N/A%' and a bare '%' both mean "no reading" and fall back to 5 (%).
# Vectorised string handling replaces the per-row np.int(...) call; the np.int
# alias was removed from NumPy and raises AttributeError on current releases.
WEATHER_raw['Humidity'] = (
    WEATHER_raw['Humidity']
    .replace({'N/A%': '5%', '%': '5%'})
    .str.split('%').str[0]
    .astype(int)
)

In [42]:
# Compute the human comfort index (SSD) for every observation.
WEATHER_raw['SSD'] = SSD(
    Temp=WEATHER_raw['Temp'],
    Velo=WEATHER_raw['Wind_speed'],
    Humi=WEATHER_raw['Humidity'],
)

In [43]:
# Weather condition: treat 'Unknown' as missing and carry the previous
# observation's condition forward.
# Series.ffill() replaces the deprecated fillna(method='ffill').
# NOTE(review): the fill runs over the whole frame in row order, so it can
# carry a value across a city boundary -- confirm that this is acceptable.
WEATHER_raw['Condition'] = (
    WEATHER_raw['Condition']
    .mask(WEATHER_raw['Condition'] == 'Unknown')
    .ffill()
)

In [44]:
# Load the per-condition lookup table (keyed by the Condition column).
WEATHER_CON_LEVEL = pd.read_csv('WEATHER_CON_LEVEL.csv')

In [45]:
# Preview the condition lookup table.
WEATHER_CON_LEVEL.head()

Unnamed: 0,Condition,Con_Num,Con_Pct,LEVEL,RAIN_IND,CLEAR_IND
0,Clear,672861,0.251215,1,,1.0
1,Mostly Cloudy,295609,0.110367,0,,1.0
2,Mist,291402,0.108796,2,,
3,Scattered Clouds,266207,0.099389,0,,1.0
4,Unknown,206857,0.077231,1,,


In [46]:
# Attach the lookup columns to every observation by weather condition
# (a left join keeps all observations). A missing rain / clear indicator
# is then filled with 0.0.
WEATHER_raw = (
    WEATHER_raw
    .merge(WEATHER_CON_LEVEL, how='left', on='Condition')
    .fillna({'RAIN_IND': 0.0, 'CLEAR_IND': 0.0})
)

In [47]:
# Reduce to the key columns plus the three weather features:
# comfort index (SSD), rain indicator and clear-sky indicator.
WEATHER_raw = WEATHER_raw.loc[
    :, ['DATE', 'Time', 'CITY_EN', 'SSD', 'RAIN_IND', 'CLEAR_IND']
]

In [48]:
# Preview the reduced table: keys plus SSD and the rain / clear indicators.
WEATHER_raw.head() 

Unnamed: 0,DATE,Time,CITY_EN,SSD,RAIN_IND,CLEAR_IND
0,2015-05-01,0.0,shanghai,57.963284,0.0,1.0
1,2015-05-01,0.5,shanghai,54.763284,0.0,1.0
2,2015-05-01,1.0,shanghai,57.963284,0.0,1.0
3,2015-05-01,1.5,shanghai,54.763284,0.0,1.0
4,2015-05-01,2.0,shanghai,54.763284,0.0,1.0


In [49]:
# Keep only observations taken between 11:00 and 18:30 (both ends inclusive).
time1 = WEATHER_raw[WEATHER_raw['Time'].between(11, 18.5)]

In [50]:
# Preview the observations kept by the time-of-day filter.
time1.head()

Unnamed: 0,DATE,Time,CITY_EN,SSD,RAIN_IND,CLEAR_IND
22,2015-05-01,11.0,shanghai,61.904897,0.0,1.0
23,2015-05-01,11.5,shanghai,62.76568,0.0,1.0
24,2015-05-01,12.0,shanghai,62.76568,0.0,1.0
25,2015-05-01,12.5,shanghai,64.71644,0.0,1.0
26,2015-05-01,13.0,shanghai,66.154147,0.0,1.0


In [56]:
# Average every remaining column per city and per date; the grouping keys
# stay as ordinary columns (as_index=False).
time1_group = (
    time1
    .groupby(['CITY_EN', 'DATE'], as_index=False)
    .mean()
)

In [58]:
# Inspect the last 50 per-city, per-date averages.
time1_group.tail(50)

Unnamed: 0,CITY_EN,DATE,Time,SSD,RAIN_IND,CLEAR_IND
74187,zigong,2016-11-12,14.5,59.997342,0.0,1.0
74188,zigong,2016-11-13,14.5,62.690217,0.0,0.0
74189,zigong,2016-11-14,14.5,63.773966,0.0,0.0
74190,zigong,2016-11-15,14.5,63.766724,0.0,0.25
74191,zigong,2016-11-16,14.5,67.73323,0.0,0.25
74192,zigong,2016-11-17,14.5,70.393359,0.0,0.0
74193,zigong,2016-11-18,14.5,69.281506,0.0,0.375
74194,zigong,2016-11-19,14.5,69.934733,0.0,0.25
74195,zigong,2016-11-20,14.5,70.04939,0.0,0.75
74196,zigong,2016-11-21,14.5,65.836677,0.125,0.875


In [59]:
# Day-over-day change in discomfort: today's |SSD - 60| minus that of the
# previous row.
# The shift is done per city, so the first day of a city gets NaN instead of
# being compared with the last day of the preceding city in the stacked frame.
# NOTE(review): "previous row" is the previous available date for that city,
# not necessarily the previous calendar day -- confirm there are no date gaps.
time1_group['SSD_C'] = (
    (time1_group['SSD'] - 60).abs()
    - (time1_group.groupby('CITY_EN')['SSD'].shift(1) - 60).abs()
)

In [60]:
# Preview the daily table with the new SSD_C column.
time1_group.head()

Unnamed: 0,CITY_EN,DATE,Time,SSD,RAIN_IND,CLEAR_IND,SSD_C
0,ankang,2015-05-01,14.5,55.881064,0.5,0.625,
1,ankang,2015-05-02,14.5,70.259526,0.0,0.625,6.14059
2,ankang,2015-05-03,14.5,65.619166,0.125,0.875,-4.64036
3,ankang,2015-05-04,14.5,66.778863,0.0,0.625,1.159697
4,ankang,2015-05-05,14.5,73.809607,0.0,1.0,7.030744


In [61]:
# Restrict to 2015-06-26 .. 2016-11-20, both ends inclusive. DATE holds
# ISO-formatted strings, so plain string comparison orders them correctly.
time1_group = time1_group[time1_group['DATE'].between('2015-06-26', '2016-11-20')]

In [62]:
# Switch the feature columns to their short output codes.
time1_group = time1_group.rename(
    columns={
        'SSD': 'RC',
        'SSD_C': 'RE',
        'RAIN_IND': 'RG',
        'CLEAR_IND': 'RI',
    }
)

In [63]:
# Preview the date-filtered table with the renamed feature columns.
time1_group.head()

Unnamed: 0,CITY_EN,DATE,Time,RC,RG,RI,RE
56,ankang,2015-06-26,14.5,66.724308,0.875,0.125,-13.237872
57,ankang,2015-06-27,14.5,60.090394,0.625,0.375,-6.633914
58,ankang,2015-06-28,14.5,64.276943,0.25,0.75,4.186549
59,ankang,2015-06-29,14.5,79.057921,0.0,1.0,14.780978
60,ankang,2015-06-30,14.5,80.054664,0.0,0.875,0.996743


In [64]:
# Fix the output column order (dropping the averaged Time column) and write
# the feature table to disk without the row index.
time1_group = time1_group.loc[:, ['CITY_EN', 'DATE', 'RC', 'RE', 'RG', 'RI']]
time1_group.to_csv('WEATHER_FEATURES.csv', index=False)