# Scraping

In [1]:
import numpy as np 
import pandas as pd 
import sys, requests, shutil, os
from urllib import request, error
from datetime import date, timedelta
import datetime

In [2]:
class Scraper:
    def __init__(self, region, root_region_url, base_url, verbose=False):
        self.damaged_file = []
        self.undownload_file = []
        self.verbose = verbose
        self.region = region
        self.root_region_url = root_region_url
        self.base_url = base_url
        self.root_dir = str("./input/")
        #Creating folder and subfolder given the format /input/[Region-zone]/[timestamp as date]/[IMG]
        self.utc_date_now = datetime.datetime.utcnow().date()
        dirs = ["./input"]
        for d in dirs:
            if not os.path.exists(d):
                os.makedirs(d)
    
    def get_file_path(self, img_name):#For assigning timestamp dir to Img
        utc_time = datetime.datetime.utcnow() #utc time right now
        #extract timestamp of file name
        img_hour = img_name.split("_")[-1].split(".")[0][:2]
        img_min = img_name.split("_")[-1].split(".")[0][2:]
        img_time = datetime.datetime.utcnow()
        img_time = img_time.replace(hour=int(img_hour), minute=int(img_min), second=0, microsecond=0)
        
        if self.verbose: #for debugging
            print("UTC time right now:"+str(utc_time)+", Img timestamp:"+str(img_time))
        file_path = str(self.root_dir)
        
        return file_path
    
    def fetch_img(self, path, file_path):
        url=path
        try:
            response=requests.get(url, stream=True)
        except requests.exceptions.ConnectionError:
            if self.verbose:
                print("Connection Abort") #INCASE OF FAILED LINK, SKIP IT! DOWNLOAD LATER
            return False
        if str(response.status_code) == "404":
            if self.verbose:
                print("404 error")
            return False
        with open(file_path+"/image.jpg", 'wb') as out_file: #download IMG as image.jpg, will change name later
            shutil.copyfileobj(response.raw, out_file)
        del response    
        return True
        
    def scraping(self):
        current = datetime.datetime.utcnow()- timedelta( minutes=20)#ต้องดึงของ10 นาทีก่อน
        current=current.strftime("%H%M") 
        time=int(current)-int(current)%10
        time=('0000'+str(time))[4:]

        band = "se1_b08"

        links = []
        req = request.Request(self.root_region_url)
        with request.urlopen(req) as response:
            html = response.read().decode("utf-8")#extract all imgs link from html
            for un_slice_url in html.split("<a href=")[1:]:
                if band in un_slice_url and str(time) in un_slice_url:
                    links.append(self.base_url + un_slice_url.split(">")[0])#fill extrated links into an array


        
        #for every picture links
        for link in links:
            #extract image name from url
            #Example: http://www.data.jma.go.jp/mscweb/data/himawari/img/se1/se1_b13_0000.jpg
            #will extract se1_b13_000.jpg from url above
            img_name = link.split("/")[-1]
            file_path = self.get_file_path(img_name)#to verify timestamp directory of this picture
            if not file_path:
                if self.verbose:
                    print("utc_time == img_time, Skip")
                self.undownload_file.append(link)
                continue
            
            if self.verbose:
                print("File path:"+file_path+"/"+img_name)
                
            if os.path.exists(file_path+'/'+img_name):#incase of download existing file
                if self.verbose:
                    print("File Exists")
                continue
            if self.fetch_img(link, file_path): #if Connection is success
                if self.verbose:
                    print("Connection success!")
                os.rename(file_path+'/image.jpg', file_path+'/'+ img_name) 
            else:
                self.damaged_file.append(link)
                
                return None
                #rename image.jpg into image format name
                #Example:se1_b13_0000.jpg
                #This format is AREA:BAND_TYPE:TIMESTAMP.jpg
        #write Damaged file and undownload file to log
        print(self.region+" Damaged file")
        for i in self.damaged_file:
            print(i)
        print("\n"+self.region+" Undownload file")
        for i in self.undownload_file:
            print(i)
        return img_name
    
#write log file if program terminated
def exit_handler(*args):
    for key in args:
        if len(key.damaged_file)>0:
            print(key.region+" Damaged file")
            for i in key.damaged_file:
                print(i)
        if len(key.undownload_file)>0:
            print("\n"+key.region+" Undownload file")
            for i in key.undownload_file:
                print(i)



# Preprocess

In [3]:
from line_killer import Img_preprocess
from shutil import copyfile
import os
import shutil

# Model

In [4]:
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as f
import torch
import numpy as np
import os
import sys
import torch
from torchvision import transforms, datasets
import torch.optim as optim
import matplotlib.pyplot as plt
from CovLstm_cell_simply import ConvLSTMCell as Covlstm_cell
import scipy.misc


In [5]:
class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = Covlstm_cell(1,1)
        self.decoder = Covlstm_cell(1,1)
        #self.hidden_size = hidden_size
        #self.Convlstm_cell = Covlstm_cell(num_cell,hidden_size)
        #self.relu = nn.ReLU()
    def forward(self,data,epoch,T_en,T_de): #T_en = input sequence, T_de = output sequence
        encoder_state = None
        decoder_state = None
        for t in range(epoch, T_en):
              encoder_state = self.encoder.forward(data[t],encoder_state)
        decoder_input = encoder_state[0][0]
        decoder_input = decoder_input[:,None,:,:] 
        for t in range(0,T_de) :
            decoder_state = self.decoder.forward(decoder_input,decoder_state)
        y_pre = decoder_state[0][0][0]
        # Dont care about hidden states
        return y_pre

In [99]:
T_de = 20
lr = 0.001
print('Instantiate model')
m = model().cuda()
print(repr(m))

lendata = 100
seq = 20
lenpredict = 6

print('Create a MSE criterion')
loss_fn = nn.MSELoss().cuda()
print(loss_fn)

params = list(m.parameters()) 
print('optimizer Adam')
optimizer = optim.Adam(params, lr=lr)
print(optimizer)
index = 0
index_last_x = 0

Instantiate model
model(
  (encoder): ConvLSTMCell(
    (Gates): Conv2d(2, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (decoder): ConvLSTMCell(
    (Gates): Conv2d(2, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)
Create a MSE criterion
MSELoss()
optimizer Adam
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)


  torch.nn.init.xavier_normal(self.Gates.weight)
  torch.nn.init.constant(self.Gates.bias, 0)


In [100]:
#load model
m.load_state_dict(torch.load('model\\model1.pth'))


In [5]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn1_1 = Covlstm_cell(1,1)
        self.rnn1_2 = Covlstm_cell(1,1)
        self.rnn1_3 = Covlstm_cell(1,1)
        self.rnn2_1 = Covlstm_cell(1,1)       
    def init_hiden(self):
        hidden = []
        hidden1_1 = None
        hidden1_2 = None
        hidden1_3 = None        
        hidden2_1 = None
        hidden.append(hidden1_1)
        hidden.append(hidden1_2)
        hidden.append(hidden1_3)
        hidden.append(hidden2_1)
        return hidden       
    def forward(self,data,hidden):
        hidden1_1 = hidden[0]
        hidden1_2 = hidden[1]
        hidden1_3 = hidden[2]        
        hidden2_1 = hidden[3] 
        hidden1_1 = self.rnn1_1.forward(data ,hidden1_1)               
        hidden1_2_input = hidden1_1[0][0]
        hidden1_2_input = hidden1_2_input[:,None,:,:] 
        hidden1_2 = self.rnn1_2.forward(hidden1_2_input,hidden1_2)       
        hidden1_3_input = hidden1_2[0][0]
        hidden1_3_input = hidden1_3_input[:,None,:,:] 
        hidden1_3 = self.rnn1_3.forward(hidden1_3_input,hidden1_3)      
        hidden2_1_input = hidden1_3[0][0]
        hidden2_1_input = hidden2_1_input[:,None,:,:] 
        hidden2_1 = self.rnn2_1.forward(hidden2_1_input ,hidden2_1)
        encoder_out = hidden2_1[0]
        hidden = []
        hidden.append(hidden1_1)
        hidden.append(hidden1_2)
        hidden.append(hidden1_3)
        hidden.append(hidden2_1)
        return encoder_out,hidden
    
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        num_c = [1, 1, 1]
        h = [1,1,1]
        self.rnn1_1 = Covlstm_cell(1,1)
        self.rnn1_2 = Covlstm_cell(1,1)
        self.rnn1_3 = Covlstm_cell(1,1)
        self.rnn2_1 = Covlstm_cell(1,1)
        
    def forward(self,data,hidden_en):
        hidden1_1 = hidden_en[3]
        hidden1_2 = hidden_en[2]
        hidden1_3 = hidden_en[1]     
        hidden2_1 = hidden_en[0]     
        hidden1_1 = self.rnn1_1.forward(data,hidden1_1)                
        hidden1_2_input = hidden1_1[0][0]
        hidden1_2_input = hidden1_2_input[:,None,:,:] 
        hidden1_2 = self.rnn1_2.forward(hidden1_2_input,hidden1_2)        
        hidden1_3_input = hidden1_2[0][0]
        hidden1_3_input = hidden1_3_input[:,None,:,:] 
        hidden1_3 = self.rnn1_3.forward(hidden1_3_input,hidden1_3)
        hidden2_1_input = hidden1_3[0][0]
        hidden2_1_input = hidden2_1_input[:,None,:,:] 
        hidden2_1 = self.rnn2_1.forward(hidden2_1_input ,hidden2_1)
        out = hidden2_1[0]
        hidden = []
        hidden.append(hidden1_1)
        hidden.append(hidden1_2)
        hidden.append(hidden1_3)
        hidden.append(hidden2_1)
        return out,hidden

In [6]:
class TraModel(nn.Module):
    def __init__(self):
        super().__init__()
        #input_size_c = 1 hidden_size = h
        self.enc  = Encoder()
        self.dec  = Decoder()
        
    def forward(self,data,epoch):
        hidden_en = self.enc.init_hiden()
        T_en = 9 # same seq
        T_en = T_en+epoch
        for t in range(epoch, T_en):
            enc_output,hidden_en = self.enc(data[t],hidden_en)
        #self.dec.init_h0(hidden_en)
        dec_output = enc_output

        for t in range(epoch, T_en):
            dec_output,hidden_en= self.dec(dec_output,hidden_en)
        dec_output = dec_output[0][0]
        return dec_output

In [7]:
model_3hr = TraModel()
dic_param = torch.load('model/model8_10seq_10000dataset.pt')
model_3hr.load_state_dict(dic_param)

  torch.nn.init.xavier_normal(self.Gates.weight)
  torch.nn.init.constant(self.Gates.bias, 0)


In [8]:
############# load data ################
from matplotlib.image import imread
import matplotlib.image as mpimg
from skimage import color
from scipy import ndimage, misc
import cloudy
def load_images(image_paths):
    # Load the images from disk.
    images = [color.rgb2gray(imread(path)) for path in image_paths]
    
    # Convert to a numpy array and return it.
    return np.asarray((images), dtype=np.float32)
current = cloudy.get_data_dir('input')

In [15]:
now=datetime.datetime.utcnow()
start = now-timedelta(minutes=200)
target=now.strftime('%H%M')[:3]
for i in range(len(current)):
    name= current[i][-8:-5]
    if target == name:
        break
temp=current[:i+1]
current=current[-20+len(temp):]
current=current+temp

In [114]:
def predict_and_save(train_dir):
    train_data = load_images(train_dir)
    Nx_input = torch.from_numpy(train_data).cuda()
    torch.manual_seed(0)
    x_input = Nx_input[:]/255
    x_input = x_input[:,None,None,:,:]
    x_input = Variable(x_input).cuda()

    epoch=0
    T_en = 20
    T_en = T_en+epoch
    output = m(x_input,epoch,T_en,T_de)
    img = output.cpu()
    img = img.data.numpy()
    #img.shape
    img = img*255
    image_name = "prediction\\"+train_dir[-1][-16:]
    print(image_name)
    scipy.misc.imsave(image_name, img)
    return img

In [9]:
def predict_and_save_3hr(train_dir):
    seq = 9
    train_data = load_images(train_dir)
    Nx_input = torch.from_numpy(train_data)
    torch.manual_seed(0)
    x_input = Nx_input[:]/255
    x_input = x_input[:,None,None,:,:]
    x_input = Variable(x_input)
    epoch=0
    T_en = 9
    #T_en = T_en+epoch
    output = model_3hr(x_input,epoch)
    img = output.cpu() 
    img = img.data.numpy()
    #img.shape
    img = img*255
    image_name = "prediction_3hr\\"+train_dir[-1][-16:]
    print(image_name)
    scipy.misc.imsave(image_name, img)
    return img

In [10]:
current=current[-9:]

In [11]:
img = predict_and_save_3hr(current)

RuntimeError: Expected a Tensor of type torch.FloatTensor but found a type torch.cuda.FloatTensor for sequence element 1 in sequence argument at position #1 'tensors'

# FTP

In [14]:
from ftplib import FTP

def placeFile(ftp,filename,filedir):
    try:
        ftp.delete(filename)
    except Exception as e:
        print(e.message)
    status = ftp.storbinary('STOR '+ filename, open(filedir,'rb')) #rb
    print("upload file",filename)
    print(status)

In [18]:
ftp = FTP('waws-prod-dm1-119.ftp.azurewebsites.windows.net')
ftp.login(user='deepsky\$deepsky',
      passwd='khqWN6sZlqler3u4CGxoHX4wWF1jeqCC2YfpCiEjiLPTvhrbc4ffuvn0gq1s')
ftp.cwd('site/public/storage/images')

'250 CWD command successful.'

In [19]:
img_name='se1_b08_0320.jpg'
placeFile(ftp,filename=img_name,filedir='input1\\'+img_name)

upload file se1_b08_0320.jpg
226 Transfer complete.


# schedule for every 10 mins

In [18]:
import sched, time
s = sched.scheduler(time.time, time.sleep)

In [117]:
#def real_time_update(sc):
##scrap part
try:
    print("try scraping")
    SE1 = Scraper(region = "se1",
                 root_region_url = "http://www.data.jma.go.jp/mscweb/data/himawari/list_se1.html",
                 base_url = "http://www.data.jma.go.jp/mscweb/data/himawari/",
                 verbose = True)

    try:
        img_name=SE1.scraping()
    except KeyboardInterrupt:
        #rint("fail scraping")
        exit_handler(SE1)
    #    
    if img_name is not None:
        path = os.path.abspath('input\\'+img_name)
        print("store image")
        #store original image
        ftp = FTP('waws-prod-dm1-119.ftp.azurewebsites.windows.net')
        ftp.login(user='deepsky\$deepsky',
              passwd='khqWN6sZlqler3u4CGxoHX4wWF1jeqCC2YfpCiEjiLPTvhrbc4ffuvn0gq1s')
        ftp.cwd('site/public/storage/images')
        placeFile(ftp,filename=img_name,filedir='input\\'+img_name)
        print("preprocess")
        #preprocess
        preprocess = Img_preprocess(filepath = path)
        try:
            img = preprocess.clear_green(region='se1')
            preprocess.save_img(img,name=path)
        except Exception:
            print('preprocess flail',path)
            #
        current.append(path)
        while len(current)>20:
            current.pop(0)
        print("predict")
        img = predict_and_save(current);

        print("store prediction")
        #store prediction
        ftp.cwd('../')
#         ftp.retrlines('LIST')    
        ftp.cwd('next-1hr')
        placeFile(ftp,filename=img_name,filedir='prediction\\'+img_name)
        ftp.cwd('../')
        ftp.cwd('prediction')
        ftp.quit()


    else:
        print("fail to predict")
except Exception:
    print("fail to scrappingt")
#s.enter(600, 1, real_time_update, (sc,))


try scraping
UTC time right now:2019-01-10 14:08:36.233162, Img timestamp:2019-01-10 13:40:00
File path:./input//se1_b08_1340.jpg
File Exists
se1 Damaged file

se1 Undownload file
store image
upload file se1_b08_1340.jpg
226 Transfer complete.
preprocess
preprocess flail C:\Users\tanintem\Desktop\deepsky real time updater\input\se1_b08_1340.jpg
predict


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


prediction\se1_b08_1340.jpg
store prediction
upload file se1_b08_1340.jpg
226 Transfer complete.


In [123]:
now=datetime.datetime.utcnow()
now=now.strftime('%H%M')
now=now.replace(now[-1],'0')
now

'1410'

In [None]:
s.enter(1, 1, real_time_update, (s,))
s.run()

try scraping
UTC time right now:2019-01-10 08:38:05.099822, Img timestamp:2019-01-10 08:10:00
File path:./input//se1_b08_0810.jpg
File Exists
UTC time right now:2019-01-10 08:38:05.099822, Img timestamp:2019-01-10 18:10:00
File path:./input//se1_b08_1810.jpg
File Exists
se1 Damaged file

se1 Undownload file
store image
upload file se1_b08_1810.jpg
226 Transfer complete.
preprocess
preprocess flail C:\Users\tanintem\Desktop\deepsky real time updater\input\se1_b08_1810.jpg
predict


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


prediction\se1_b08_1810.jpg
store prediction
upload file se1_b08_1810.jpg
226 Transfer complete.
try scraping
UTC time right now:2019-01-10 08:48:14.748746, Img timestamp:2019-01-10 08:20:00
File path:./input//se1_b08_0820.jpg
File Exists
UTC time right now:2019-01-10 08:48:14.748746, Img timestamp:2019-01-10 18:20:00
File path:./input//se1_b08_1820.jpg
File Exists
se1 Damaged file

se1 Undownload file
store image
upload file se1_b08_1820.jpg
226 Transfer complete.
preprocess
preprocess flail C:\Users\tanintem\Desktop\deepsky real time updater\input\se1_b08_1820.jpg
predict
prediction\se1_b08_1820.jpg
store prediction
upload file se1_b08_1820.jpg
226 Transfer complete.
try scraping
UTC time right now:2019-01-10 08:58:23.552746, Img timestamp:2019-01-10 08:30:00
File path:./input//se1_b08_0830.jpg
File Exists
UTC time right now:2019-01-10 08:58:23.553748, Img timestamp:2019-01-10 18:30:00
File path:./input//se1_b08_1830.jpg
File Exists
se1 Damaged file

se1 Undownload file
store image
u