<img src="https://ibm.box.com/shared/static/hhxv35atrvcom7qc1ngn632mkkg5d6l3.png" width="200">

<h2 align="center"> Toronto - Big Data University Meetup</h2>
<h1 align="center"> Data Mining Algorithms</h1>
<h3 align="center"> October 26, 2015</h3>
<h4 align="center"><a href="https://linkedin.com/in/polonglin">Polong Lin</a></h4>
<h4 align="center"><a href="https://ca.linkedin.com/in/saeedaghabozorgi">Saeed Aghabozorgi</a></h4>

<hr>

# Weather Station Clustering 

## K-means using Python & scikit-learn


Environment Canada		  
Monthly Values for July - 2015	  	
		
		
Stn_Name::::	Station Name  
Lat	    ::::	Latitude (North + , degrees)  
Long	::::	Longitude (West - , degrees)  
Prov	::::	Province  
Tm	    ::::	Mean Temperature (°C)  
DwTm	::::	Days without Valid Mean Temperature  
D	    ::::	Mean Temperature difference from Normal (1981-2010) (°C)  
Tx	    ::::	Highest Monthly Maximum Temperature (°C)  
DwTx	::::	Days without Valid Maximum Temperature  
Tn	    ::::	Lowest Monthly Minimum Temperature (°C)  
DwTn	::::	Days without Valid Minimum Temperature  
S	    ::::	Snowfall (cm)  
DwS	    ::::	Days without Valid Snowfall  
S%N	    ::::	Percent of Normal (1981-2010) Snowfall  
P	    ::::	Total Precipitation (mm)  
DwP	    ::::	Days without Valid Precipitation  
P%N	    ::::	Percent of Normal (1981-2010) Precipitation  
S_G  	::::	Snow on the ground at the end of the month (cm)  
Pd	    ::::	Number of days with Precipitation 1.0 mm or more  
BS	    ::::	Bright Sunshine (hours)  
DwBS	::::	Days without Valid Bright Sunshine  
BS%  	::::	Percent of Normal (1981-2010) Bright Sunshine  
HDD 	::::	Degree Days below 18 °C  
CDD	    ::::	Degree Days above 18 °C  
Stn_No	::::	Climate station identifier (first 3 digits indicate   drainage basin, last 4 characters are for sorting alphabetically).  
NA	    ::::	Not Available  

### 1-Download Data

In [None]:
# Shell escape: download the weather-station CSV from the shared link.
# wget's -O option writes the download to the given path under /resources.
!wget -O /resources/weather-stations20140101-20141231.csv https://ibm.box.com/shared/static/mv6g5p1wpmpvzoz6e5zgo47t44q8dvm0.csv

### 2- Load data

In [None]:
import csv
import pandas as pd
import numpy as np

# Path of the weather-station CSV to load.
filename = '/resources/weather-stations20140101-20141231.csv'

# Parse the whole file into a DataFrame, then preview its first five rows.
pdf = pd.read_csv(filepath_or_buffer=filename)
pdf.head(n=5)

### 3- Clean data

In [None]:
# Drop every station that has no mean temperature (Tm) value, then
# renumber the surviving rows from 0 so the index has no gaps.
pdf = pdf.dropna(subset=["Tm"]).reset_index(drop=True)
pdf.head(n=5)

### 4- Visualization of data

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = (14,10)

llon = -140
ulon = -50
llat = 40
ulat = 65

pdf = pdf[(pdf['Long'] > llon) & (pdf['Long'] < ulon) & (pdf['Lat'] > llat) &(pdf['Lat'] < ulat)]

my_map = Basemap(projection='merc',
            resolution = 'l', area_thresh = 1000.0,
            llcrnrlon=llon, llcrnrlat=llat, #min longitude (llcrnrlon) and latitude (llcrnrlat)
            urcrnrlon=ulon, urcrnrlat=ulat) #max longitude (urcrnrlon) and latitude (urcrnrlat)

my_map.drawcoastlines()
my_map.drawcountries()
my_map.drawmapboundary()
my_map.fillcontinents(color = 'white', alpha = 0.3)
my_map.shadedrelief()

# To collect data based on stations        

xs,ys = my_map(np.asarray(pdf.Long), np.asarray(pdf.Lat))
pdf['xm'] = xs.tolist()
pdf['ym'] =ys.tolist()

#Visualization1
for index,row in pdf.iterrows():
#   x,y = my_map(row.Long, row.Lat)
   my_map.plot(row.xm, row.ym,markerfacecolor =([1,0,0]),  marker='o', markersize= 5, alpha = 0.75)
#plt.text(x,y,stn)
plt.show()



### 5- Clustering based on location and temperature

In [None]:
from sklearn.cluster import KMeans
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# Number of k-means clusters.
clusterNum = 7

# Feature matrix: one row per station, columns are the projected map
# coordinates xm / ym. Taking the columns straight from the DataFrame
# (rather than zip(...)) yields a 2-D array whether or not zip is lazy.
# NaNs are replaced by 0, then every column is standardised.
Clus_dataSet = np.nan_to_num(pdf[["xm", "ym"]].values)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# The seed is passed to KMeans itself so the clustering is reproducible;
# calling sklearn.utils.check_random_state(1000) on its own only returns
# a RandomState object and does not seed the estimator.
est = KMeans(n_clusters=clusterNum, random_state=1000)
est.fit(Clus_dataSet)
labels = est.labels_
pdf["Clus_km"] = labels

# A sample of clusters
pdf[["Stn_Name","xm","ym","Tx","Tm","Clus_km"]].head(5)

### 6- Visualize the clusters

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = (14,10)

my_map = Basemap(projection='merc',
            resolution = 'l', area_thresh = 1000.0,
            llcrnrlon=llon, llcrnrlat=llat, #min longitude (llcrnrlon) and latitude (llcrnrlat)
            urcrnrlon=ulon, urcrnrlat=ulat) #max longitude (urcrnrlon) and latitude (urcrnrlat)

my_map.drawcoastlines()
my_map.drawcountries()
my_map.drawmapboundary()
my_map.fillcontinents(color = 'white', alpha = 0.3)
my_map.shadedrelief()

# To create a color map
colors = plt.get_cmap('jet')(np.linspace(0.0, 1.0, clusterNum))

#Visualization1
for index,row in pdf.iterrows():
    my_map.plot(row.xm, row.ym,markerfacecolor =colors[np.float(row.Clus_km)],  marker='o', markersize= 5, alpha = 0.75)

for i in range(clusterNum): 
    cluster = pdf[["Stn_Name","Tm","xm","ym","Clus_km"]][pdf.Clus_km==i]
    cenx = np.mean(cluster.xm) 
    ceny = np.mean(cluster.ym) 
    plt.text(cenx,ceny,str(i), fontsize=25, color='red',)
    print "Cluster "+str(i)+', Avg Temp: '+ str(np.mean(cluster.Tm))

In [None]:
from sklearn.cluster import KMeans
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# Number of k-means clusters.
clusterNum = 7

# Feature matrix: one row per station, columns are the projected map
# coordinates (xm, ym) and the temperature columns Tx, Tn and Tm. Taking
# the columns straight from the DataFrame (rather than zip(...)) yields a
# 2-D array whether or not zip is lazy. NaNs are replaced by 0, then
# every column is standardised.
Clus_dataSet = np.nan_to_num(pdf[["xm", "ym", "Tx", "Tn", "Tm"]].values)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# The seed is passed to KMeans itself so the clustering is reproducible;
# calling sklearn.utils.check_random_state(1000) on its own only returns
# a RandomState object and does not seed the estimator.
est = KMeans(n_clusters=clusterNum, random_state=1000)
est.fit(Clus_dataSet)
labels = est.labels_
pdf["Clus_km"]=labels

# A sample of clusters
pdf[["Stn_Name","xm","ym","Tx","Tm","Clus_km"]].head(5)

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = (14,10)

my_map = Basemap(projection='merc',
            resolution = 'l', area_thresh = 1000.0,
            llcrnrlon=llon, llcrnrlat=llat, #min longitude (llcrnrlon) and latitude (llcrnrlat)
            urcrnrlon=ulon, urcrnrlat=ulat) #max longitude (urcrnrlon) and latitude (urcrnrlat)

my_map.drawcoastlines()
my_map.drawcountries()
my_map.drawmapboundary()
my_map.fillcontinents(color = 'white', alpha = 0.3)
my_map.shadedrelief()

# To create a color map
colors = plt.get_cmap('jet')(np.linspace(0.0, 1.0, clusterNum))

#Visualization1
for index,row in pdf.iterrows():
    my_map.plot(row.xm, row.ym,markerfacecolor =colors[np.float(row.Clus_km)],  marker='o', markersize= 5, alpha = 0.75)

for i in range(clusterNum): 
    cluster = pdf[["Stn_Name","Tm","xm","ym","Clus_km"]][pdf.Clus_km==i]
    cenx=np.mean(cluster.xm) 
    ceny=np.mean(cluster.ym) 
    plt.text(cenx,ceny,str(i), fontsize=25, color='red',)
    print "Cluster "+str(i)+', Avg Temp: '+ str(np.mean(cluster.Tm))