# Final Project - Clustering Volcanoes according to eruption time

The 'eruptions.csv' file contains 10000 rows of volcanic activity information. The first column is the duration of an eruption, and the second is the waiting time between two eruptions. Clustering should be applied to the entire dataset. 

### Perform the following operations:
1. Read the dataset in python using pandas. Plot a scatter of the two columns. 
2. Run the KMeans algorithm on this dataset (using random initialization). Choose the appropriate number of clusters for this data. 
3. Plot the output of Kmeans, colored according to clusters and obtained centroids. 
4. For each cluster, plot the line from the centroid to the farthest point in the cluster. 
5. Find all the points in each cluster whose distance from the centroid is greater than the mean distance of that cluster. 
6. What is the average distance of each cluster? (From the centroid to all points in the cluster)
7. For the following points, find out which cluster they fall in?  
dur_eruptions = 2.5, 3.0, 3.2, 4.5, 5.0, 1.5  
wait_time_btn_eruptions = 70, 55, 70, 80, 70, 100  
Plot these points in separate markers & color along with the clusters

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Load the eruption records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='eruptions.csv')
# Show the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

### Read the dataset in python using pandas. Plot a scatter of the two columns.

In [None]:
# Scatter of eruption duration against the waiting time between eruptions.
# The series carries a label so that plt.legend() has an entry to show;
# calling legend() without any labelled artist only produces a warning.
plt.scatter(df['dur_eruptions'], df['wait_time_btn_eruptions'], label='eruptions')
plt.xlabel('dur_eruptions')
plt.ylabel('wait_time_btn_eruptions')
plt.title('dur_eruptions vs wait_time_btn_eruptions')
plt.legend()
# Pass True explicitly: a bare plt.grid() toggles the grid instead of enabling it.
plt.grid(True)

### Run the KMeans algorithm on this dataset (using random initialization). Choose the appropriate number of clusters for this data

In [None]:
# Elbow method: fit KMeans for every candidate cluster count and record the
# inertia (within-cluster sum of squared distances) of each fit.
# KMeans is already imported in the import cell at the top of the notebook.
n_clusters = 10
k_values = range(1, n_clusters)  # candidate counts 1 .. n_clusters - 1

list1 = []
# The loop variable is deliberately NOT called `k`: later cells store the
# fitted model in `k`, and re-running this cell must not overwrite it.
for n_k in k_values:
    kmeans = KMeans(n_clusters=n_k, init='random', n_init=10, max_iter=300, random_state=0)
    kmeans.fit(df)
    list1.append(kmeans.inertia_)

# Inertia against cluster count; the "elbow" suggests the number of clusters.
plt.plot(k_values, list1, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.title('Elbow curve')
plt.grid(True)
list1


### Plot the output of Kmeans, colored according to clusters and obtained centroids

In [None]:
# Final model with two clusters.
# random_state pins the random initialisation so that the label numbering
# (0/1) and the order of the centroids are the same on every run; later
# cells index the centroids as result[0] / result[1] and rely on that.
# n_init is spelled out so the behaviour does not depend on the library default.
k = KMeans(n_clusters=2, init='random', n_init=10, max_iter=300, random_state=0)

# y: cluster label assigned to each row of df.
y = k.fit_predict(df)
print(y)

# result: centroid coordinates, one row per cluster.
result = k.cluster_centers_
print(result)


### For each cluster plot the line from the centroid to the farthest point in the cluster.

In [None]:
# Samples coloured by their assigned cluster, with the centroids in black.
plt.scatter(df['dur_eruptions'], df['wait_time_btn_eruptions'], c=y, s=50, cmap='autumn')
plt.scatter(k.cluster_centers_[:, 0], k.cluster_centers_[:, 1], color='black', s=60, label='centroids')

# For each cluster, draw the line from its centroid to the farthest member.
# The farthest point is computed here from df / y / k so that this cell does
# not depend on any cell further down the notebook.
points = df.to_numpy()
for label, centroid in enumerate(k.cluster_centers_):
    members = points[y == label]
    # Euclidean distance of every member of this cluster to its centroid.
    member_dist = np.linalg.norm(members - centroid, axis=1)
    farthest = members[member_dist.argmax()]
    plt.plot([centroid[0], farthest[0]], [centroid[1], farthest[1]], color='blue')

plt.xlabel('dur_eruptions')
plt.ylabel('wait_time_btn_eruptions')
plt.title('KMeans clusters, centroids and farthest points')
plt.legend()
plt.grid(True)

### Find all the points in each cluster whose distance from the centroid is greater than the mean distance of that cluster

In [None]:
# Split the samples by assigned cluster label into plain NumPy arrays:
# p0 holds the rows labelled 0, p1 the rows labelled 1.
p0, p1 = (df[y == label].to_numpy() for label in (0, 1))


In [None]:
# Euclidean distance from the first centroid (result[0]) to every row of p0.
dist0 = np.linalg.norm(p0 - result[0], axis=1)

# Mean centroid-to-point distance of this cluster.
mean = dist0.mean()

# All points of the cluster lying farther from the centroid than the mean.
far_points0 = p0[dist0 > mean]

# farp0 is the single farthest point of the cluster. Previously it was just
# the last point in row order whose distance exceeded the mean, and it was
# left undefined when no point exceeded the mean.
farp0 = p0[dist0.argmax()]

print("For 1st cluster")
print("{0} of {1} points are farther than the mean distance {2}".format(len(far_points0), len(p0), mean))
print(farp0)



In [None]:
# Euclidean distance from the second centroid (result[1]) to every row of p1.
dist1 = np.linalg.norm(p1 - result[1], axis=1)

# Mean centroid-to-point distance of this cluster.
mean = dist1.mean()

# All points of the cluster lying farther from the centroid than the mean.
far_points1 = p1[dist1 > mean]

# farp1 is the single farthest point of the cluster. Previously it was just
# the last point in row order whose distance exceeded the mean, and it was
# left undefined when no point exceeded the mean.
farp1 = p1[dist1.argmax()]

print("For 2nd cluster")
print("{0} of {1} points are farther than the mean distance {2}".format(len(far_points1), len(p1), mean))
print(farp1)

### What is the average distance of each cluster? (From the centroid to all points in the cluster)

In [None]:
# Distance from each centroid to the point stored for its cluster in
# farp0 / farp1 by the earlier cells (same value as the hand-written
# sqrt((x2-x1)**2 + (y2-y1)**2), computed with np.linalg.norm).
result0 = np.linalg.norm(farp0 - result[0])
print("Distance of cluster 1 is {0}".format(result0))

result1 = np.linalg.norm(farp1 - result[1])
print("Distance of cluster 2 is {0}".format(result1))

# Average distance of each cluster: mean Euclidean distance from the centroid
# to every point assigned to that cluster (p0 / p1 hold the cluster members).
avg_dist0 = np.linalg.norm(p0 - result[0], axis=1).mean()
avg_dist1 = np.linalg.norm(p1 - result[1], axis=1).mean()
print("Average distance of cluster 1 is {0}".format(avg_dist0))
print("Average distance of cluster 2 is {0}".format(avg_dist1))




### For the following points, find out which cluster they fall in?
### dur_eruptions = 2.5, 3.0, 3.2, 4.5, 5.0, 1.5
### wait_time_btn_eruptions = 70, 55, 70, 80, 70, 100
### Plot these points in separate markers & color along with the clusters

In [None]:
# New observations to classify, in the same order as the training features:
# eruption duration first, waiting time second.
d = {"duration": [2.5, 3.0, 3.2, 4.5, 5.0, 1.5], "waittime": [70, 55, 70, 80, 70, 100]}
d1 = pd.DataFrame(data=d)

# The model was fitted on df, so scikit-learn checks that the feature names
# passed to predict() match the training column names. d1 keeps its own
# column names (the plotting cell reads them); a renamed, column-aligned
# copy is what gets handed to the model.
new_points = d1.rename(columns={"duration": "dur_eruptions", "waittime": "wait_time_btn_eruptions"})
new_points = new_points[list(df.columns)]

# m: predicted cluster label for each new observation.
m = k.predict(new_points)
print(m)

# d2: the new observations without the last one (index 5).
d2 = d1.drop(5)
d2

In [None]:
# Clustered samples with the centroids in black.
plt.scatter(df['dur_eruptions'], df['wait_time_btn_eruptions'], c=y, s=30, cmap='autumn')
plt.scatter(k.cluster_centers_[:, 0], k.cluster_centers_[:, 1], color='black', s=60)

# New observations as large '+' markers, coloured by their predicted cluster.
plt.scatter(d1["duration"], d1["waittime"], s=200, marker='+', cmap='autumn', c=m)

# Limits widened so the new observation at (1.5, 100) is inside the axes;
# the previous limits [1.5, 6, 40, 99] cut it off.
plt.axis([1, 6, 40, 105])
plt.xlabel('dur_eruptions')
plt.ylabel('wait_time_btn_eruptions')
plt.title('dur_eruptions vs wait_time_btn_eruptions')
# Enable the grid exactly once and explicitly: a bare plt.grid() toggles the
# grid, so calling it twice in this cell switched the grid back off.
plt.grid(True)