In [1]:
# All 545110-... is weather from Beijing Airport

In [2]:
import pandas as pd
import numpy as np

In [3]:
def get_weather(file):
    with open(file,'r') as fin:
        lines = fin.readlines()
    return lines

In [6]:
f15 = get_weather('545110-99999-2015')

In [14]:
# Control Data Section
f15[0][0:60]

'0099545110999992015010100004+39933+116283FM-12+005599999V020'

In [24]:
# Mandatory Data Section
f15[0][59:106]

'03401N001019999999N010000199-00721-02041103491A'

In [19]:
len(f15)

20395

In [20]:
24*265

6360

In [23]:
f15[0][15:28]

'2015010100004'

### Create Dataset from strings

In [25]:
f16 = get_weather('545110-99999-2016')
f17 = get_weather('545110-99999-2017')

In [143]:
def extract_data(line):
    # Remove ending \n
    line = line.strip()
    ## Control Data Section
    # Pos. 1-4 - Total varchar
    total_chars = line[:4]
    # Pos 5-10 - USAF weather station
    # 11-15 - WBAN id
    # 16 - 23 date
    date = line[15:23]
    # time/hour
    time_ = line[23:27]
    #### Mandatory Data Section
    ## Dew point 
    # The temperature to which a given parcel of air 
    # must be cooled at constant pressure and water vapor content in order for saturation to occur.
    dew_point = line[93:98]
    # temperature in degrees Celcius
    air_temp = line[87:92]
    # Air Pressure
    # The air pressure relative to Mean Sea Level (MSL).
    # Units - Hectopascals
    
    air_pressure = line[99:104]
    if int(air_pressure) == 99999:
        air_pressure = 'NaN'
    ## Combined wind direction
    # The angle, measured in a clockwise direction, between true north and 
    # the direction from which the wind is blowing.
    wind_dir = line[60:63]
    if int(wind_dir) == 999:
        wind_dir = 'NaN'
    ## Cumulated wind speed
    # The rate of horizontal travel of air past a fixed point
    wind_speed = line[65:69]
    
    ###### Additional Data Section
    chars_add = line[105:108]
    num_rain_attrs = line[108:111]
    # Cumulated hours of snow
    try:
        variable_index_snow = line.index('L')
        num_hours_snow = line[variable_index_snow + 2: variable_index_snow + 4]
    except ValueError as e:
        num_hours_snow = 'NaN'
        
    # Cumulated hours of rain
    try:
        var_index_additional_sec = line.index('ADD')
        num_hours_rain = int(line[var_index_additional_sec + 6: var_index_additional_sec + 8])
    except ValueError as e:
        num_hours_rain = 'NaN'
    if num_hours_rain == 99:
        num_hours_rain = 'NaN'
        
    #print(line)
    return tuple((date, time_, dew_point, air_temp, air_pressure, wind_dir, wind_speed, 
                 num_hours_snow, num_hours_rain))

def prepare_df(lines):
    return [extract_data(line) for line in lines]

def make_df(first_year, additional_years):
    all_years = first_year
    for year in additional_years:
        all_years += year
    all_years = prepare_df(all_years)
    columns = ['date', 'time', 'dew_point', 'air_temp', 'air_pressure','wind_dir', 'wind_speed',
              'cumulative_snow_hours', 'cumulative_rain_hours']
    return pd.DataFrame(all_years, columns=columns)

In [144]:
extract_data(f15[0])

('20150101', '0000', '-0204', '-0072', '10349', '340', '0010', 'NaN', 'NaN')

In [145]:
df = make_df(f15, [f16, f17])

In [146]:
df['year'] = df['date'].apply(lambda x: str(x)[:4])
df['month'] = df['date'].apply(lambda x: int(str(x)[4:6]))
df['day'] = df['date'].apply(lambda x: int(str(x)[6:8]))
df['hour'] = df['time'].apply(lambda x: int(str(x)[:2]))
df['datetime'] = df['date'] + df['time']

In [147]:
df_unique_datetime = df.drop_duplicates(subset='datetime')

In [148]:
df_unique_datetime['datehour'] = df_unique_datetime['date'] +\
    df_unique_datetime['hour'].apply(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [149]:
df_unique_hour = df_unique_datetime.drop_duplicates(subset='datehour')

In [150]:
final = df_unique_hour.copy()

In [151]:
final = final.drop(['date', 'time', 'datetime', 'datehour'], axis=1, inplace=False)

In [153]:
final

Unnamed: 0,dew_point,air_temp,air_pressure,wind_dir,wind_speed,cumulative_snow_hours,cumulative_rain_hours,year,month,day,hour
0,-0204,-0072,10349,340,0010,,,2015,1,1,0
3,-0220,-0050,,020,0020,,10,2015,1,1,1
5,-0220,-0030,,030,0030,,10,2015,1,1,2
7,-0225,-0012,10340,180,0030,,71,2015,1,1,3
10,-0220,+0000,,,0020,,10,2015,1,1,4
12,-0220,+0010,,160,0030,,10,2015,1,1,5
14,-0234,+0022,10289,220,0030,,6,2015,1,1,6
17,-0230,+0020,,200,0030,,10,2015,1,1,7
19,-0240,+0020,,180,0040,,10,2015,1,1,8
21,-0228,+0001,10269,220,0010,,1,2015,1,1,9


In [152]:
final.to_csv('beijing_weather_2015_2017.csv')

Attribute Information:

No: row number 
year: year of data in this row 
month: month of data in this row 
day: day of data in this row 
hour: hour of data in this row 
pm2.5: PM2.5 concentration (ug/m^3) 
DEWP: Dew Point (â„ƒ) 
TEMP: Temperature (â„ƒ) 
PRES: Pressure (hPa) 
cbwd: Combined wind direction 
Iws: Cumulated wind speed (m/s) 
Is: Cumulated hours of snow 
Ir: Cumulated hours of rain 