In [1]:
%python

In [2]:
#Amazon S3 Integration

import os

# The AWS credentials are taken from the driver's environment so that no
# secret is stored in (or exported together with) the notebook source.
# A missing variable raises KeyError before any mount is attempted.
ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
# "/" is percent-encoded because the secret is embedded in the s3a URI below.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = "dataminingoutput"
MOUNT_NAME = "mount2"

# Mount the bucket under /mnt/<MOUNT_NAME>.
dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)

In [3]:
### data import ###

def _secondField(fields):
  """Return the second of exactly two tab-separated fields.

  The two-way unpacking is deliberate: a line that does not split into
  exactly two fields raises ValueError instead of being silently accepted,
  which is what the former tuple-parameter lambda did as well.
  """
  _, text = fields
  return text

# Tuple parameters in lambdas (``lambda (a, b): ...``) are Python 2 only, so the
# pairs are unpacked explicitly. The result is an RDD of (lineIndex, secondField).
dataRDD = (sc.textFile("/FileStore/tables/5dqc4cpq1481007336149/newdata.txt")
           .map(lambda line: line.split("\t"))
           .map(_secondField)
           .zipWithIndex()
           .map(lambda pair: (pair[1], pair[0])))
dataRDD.take(2)

In [4]:
### cleaning, tokenization and stopwords removal ###

import re
stopwords = sc.textFile("/FileStore/tables/u69wlumy1480742599455/stopwords_en.txt")
stop = stopwords.collect()
# Built once: tokenize() used to rebuild set(stop) for every single token.
stopSet = frozenset(stop)

split_regex = r'\W+'

def tokenize(string):
  """Lower-case ``string``, split it on ``split_regex`` and drop noise.

  Returns the list of tokens, in order of appearance, that are non-empty
  and are not contained in the stop-word list ``stop``.
  """
  return [token for token in re.split(split_regex, string.lower())
          if token and token not in stopSet]

# mapValues keeps the key of every (key, text) pair and tokenizes only the text;
# it replaces a tuple-parameter lambda, which is a SyntaxError on Python 3.
tokenized = dataRDD.mapValues(tokenize).cache()
tokenized.take(2)

In [5]:
### Stemming ###

import nltk
from nltk.stem.porter import *
stemmer = PorterStemmer()

def stem(list):
  """Return a new list holding the Porter stem of every word in ``list``.

  The order and the length of the input are preserved.
  """
  stems = []
  for token in list:
    stems.append(stemmer.stem(token))
  return stems

# mapValues keeps the key of every (key, tokens) pair and stems only the tokens;
# it replaces a tuple-parameter lambda, which is a SyntaxError on Python 3.
stemmed = tokenized.mapValues(stem).cache()
stemmed.take(2)

In [6]:
### tf and tfidf function definitions ###
import math
from operator import add

def tf(tokens):
  """Relative term frequencies of ``tokens``.

  Returns a dict mapping every distinct token to its number of occurrences
  divided by the total number of tokens. An empty input yields an empty
  dict (it used to raise ZeroDivisionError).
  """
  counts = dict()
  # The total is accumulated by hand instead of calling sum(): a later cell
  # runs ``from numpy import *``, which rebinds the global name ``sum``.
  total = 0
  for token in tokens:
    counts[token] = counts.get(token, 0) + 1
    total += 1
  if total == 0:
    return counts
  factor = 1.0 / total
  return {token: count * factor for token, count in counts.items()}

def idfs(corpus):
    """Inverse document frequency of every token in ``corpus``.

    corpus -- pair RDD of (documentId, list of tokens)

    Returns an RDD of (token, log(N / df)) where N is the number of
    documents and df the number of documents containing the token.
    """
    N = corpus.count()
    # One (documentId, token) pair per distinct token of each document.
    docTokenPairs = corpus.flatMapValues(lambda tokens: tokens).distinct()
    docFrequencies = docTokenPairs.map(lambda pair: (pair[1], 1)).reduceByKey(add)
    # float(N): N and the document frequency are both ints, so on Python 2
    # N / df would be floored before the logarithm is taken.
    return docFrequencies.map(lambda pair: (pair[0], math.log(float(N) / pair[1])))
  
def tfidf(tokens, idfs):
    """TF-IDF weights of one document.

    tokens -- the tokens of the document
    idfs   -- dict mapping token -> inverse document frequency

    Returns a dict token -> tf * idf for every token of the document that
    has an entry in ``idfs``.
    """
    tfDict = tf(tokens)
    # Only the document's own tokens are looked up; the previous
    # set(tfDict) & set(idfs) copied the whole vocabulary for every document.
    return {token: weight * idfs[token]
            for token, weight in tfDict.items() if token in idfs}

In [7]:
### create the tf-idf matrix and convert it to 2D numpy array (to input into clustering algorithms) ###

import pandas as pd
from numpy import *

idfsRDD = idfs(stemmed).cache()
idfsMap = idfsRDD.collectAsMap()

tokenslist = stemmed.collect()

#create list of tf-idf dictionaries (one per document, in the order of tokenslist)
tfidfarray = [tfidf(doc[1], idfsMap) for doc in tokenslist]

#convert this list to numpy array and replace all NaNs with zeros
df = pd.DataFrame(tfidfarray)

# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and is available in old and new pandas alike.
# NOTE(review): .values may share memory with df, in which case the in-place
# zero-fill below also changes df (which is exported later) - confirm intent.
dataArray = df.values
where_are_NaNs = isnan(dataArray)
dataArray[where_are_NaNs] = 0
dataArray.shape

In [8]:
### PCA ### 
from sklearn.decomposition import PCA
from matplotlib import pyplot

#SCREE PLOT
pca = PCA(n_components=50)
pca.fit(dataArray)

# explained variance of every component, plotted against the component index
ev = pca.explained_variance_
num = list(range(len(ev)))

fig, ax = pyplot.subplots()
ax.plot(num,ev)
display(fig)

# fit_transform() fits the model itself, so the separate fit() calls that used
# to precede it only repeated the same decomposition a second time.

#PCA - with optimum number of components
pca2 = PCA(n_components=5)
dataArray2 = pca2.fit_transform(dataArray)
dataArray2.shape

#PCA - 2 components for plotting
pca3 = PCA(n_components=2)
dataArray3 = pca3.fit_transform(dataArray)
dataArray3.shape

In [9]:
### FUZZY C MEANS FUNCTION ###
import numpy as np
#euclidean distance function
def dist(x,y):
    """Euclidean distance: square root of the summed squared differences of x and y."""
    offset = x - y
    return np.sqrt(np.sum(np.square(offset)))

#Parameters:    
# data - data in the form of a numpy array (transposed tabular data: one row per dimension, one column per data point)
# c - no of clusters
# m - fuzzy coefficient
# req_iter -  # iterations required, tol= error tolerance

def _fcm_centers(data, weights):
    """Weighted mean of the data points for every cluster; shape (dim, c)."""
    # data is (dim, l) and weights is (c, l): one weighted sum per (dimension, cluster).
    return np.dot(data, weights.T) / np.sum(weights, axis=1)


def _fcm_distances(data, centers):
    """Euclidean distance of every point to every center; shape (c, l)."""
    # One (dim, l) difference array per cluster keeps the memory use at O(dim * l).
    return np.array([
        np.sqrt(np.sum((data - centers[:, j:j + 1]) ** 2, axis=0))
        for j in range(centers.shape[1])
    ])


def _fcm_memberships(distances, exp):
    """Membership matrix (c, l): u[j, i] = 1 / sum_r (d[j, i] / d[r, i]) ** exp."""
    nearest = np.min(distances, axis=0)
    hit = nearest == 0
    at_center = distances == 0
    # Dividing by the nearest distance keeps every ratio >= 1, so raising it
    # to -exp cannot overflow; zero distances are masked out and handled below.
    scaled = distances / np.where(hit, 1.0, nearest)
    inverse = np.where(at_center, 1.0, scaled) ** (-exp)
    memberships = inverse / np.sum(inverse, axis=0)
    if hit.any():
        # A point lying exactly on one or more centers would produce 0/0 = NaN
        # in the ratio formula; it is shared equally between those centers.
        share = at_center[:, hit].astype(float)
        memberships[:, hit] = share / np.sum(share, axis=0)
    return memberships


def _fcm_objective(distances, memberships, m):
    """Objective value: sum over clusters and points of u**m * d**2."""
    return np.sum(np.multiply(distances ** 2, memberships ** m))


def fuzzyc(data,c,m,req_iter,tol):
    """Fuzzy c-means clustering.

    Parameters:
      data     - 2-D numpy array with one row per dimension and one column per
                 data point (i.e. the transposed tabular data)
      c        - number of clusters
      m        - fuzzy coefficient; must differ from 1 (the exponent is 2/(m-1))
      req_iter - maximum number of iterations; every iteration performs two
                 successive center/membership updates
      tol      - the loop stops once the objective values of the two updates
                 of one iteration differ by no more than tol

    Returns the tuple
      (allocation, centers, SSE, SST, mem_mat, fpc, fpc_norm, iterationcount)
    where allocation holds the 1-based index of the cluster with the highest
    membership for every point, centers is (c, dim) and mem_mat is (points, c).

    The initial membership matrix is drawn from numpy's global random state.

    NOTE(review): SSE and SST are sums of plain (unsquared) Euclidean
    distances, and SST is measured from the mean of the cluster centers rather
    than from the mean of the data. That is the original behaviour and is kept
    unchanged here - confirm whether squared distances were intended.
    """
    data = np.asarray(data, dtype=float)
    # dimensions of the data and number of data points (previously
    # len(data[1]), which fails for single-feature data)
    dim, l = data.shape

    # the power for the membership function; 2.0 avoids integer division
    # on Python 2 when m is an int
    exp = 2.0/(m-1)

    #random initial membership matrix, normalized so every column sums to 1
    e = np.random.random((c,l))
    Uik = np.divide(e, np.sum(e, axis=0))
    Uikm = Uik**m

    # Loop for minimization function
    iterationcnt = 0
    njmin = 1
    jmin = 0
    vi = None

    while (np.absolute(njmin-jmin)>tol and iterationcnt<req_iter):
        #################### Iteration a ######################
        nvi = _fcm_centers(data, Uikm)
        distances = _fcm_distances(data, nvi)
        nUik = _fcm_memberships(distances, exp)
        njmin = _fcm_objective(distances, nUik, m)

        #################### Iteration b ######################
        vi = _fcm_centers(data, nUik**m)
        distances = _fcm_distances(data, vi)
        Uik = _fcm_memberships(distances, exp)
        Uikm = Uik**m
        jmin = _fcm_objective(distances, Uik, m)
        iterationcnt = iterationcnt+1

    if vi is None:
        # The loop never ran (req_iter <= 0 or tol >= 1): report the centers
        # of the initial memberships instead of failing on an undefined name.
        vi = _fcm_centers(data, Uikm)

    #this will have the final centers
    newcenters = vi.T

    #This will have the allocation of datapoints to centers
    allocation = np.argmax(Uik, axis=0)

    points = data.T

    #This will have SSE (distance of every point to its allocated center)
    SSE = np.sum(np.sqrt(np.sum((points - newcenters[allocation])**2, axis=1)))

    #This will have SST (distance of every point to the mean of the centers)
    SST = np.sum(np.sqrt(np.sum((points - np.mean(newcenters, 0))**2, axis=1)))

    #This will have the membership matrix U
    mem_mat = Uik.T

    #FPC : varies from (1/c to 1)
    fpc = np.sum(Uik**2)/l

    #FuPC (normalized to 0 to 1); 1.0/c avoids integer division on Python 2,
    #where 1/c was 0 for every c > 1
    fpc_norm = (fpc-(1.0/c))/(1-(1.0/c))

    print("The vectors below in the order are")
    print("Allocation,centers,SSE,SST,mem_mat,fpc,fpc_norm,iterationcount")
    return allocation+1,newcenters,SSE,SST,mem_mat,fpc,fpc_norm,iterationcnt

In [10]:
### RUNNING THE ALGORITHM FOR THE REDUCED DATA ###

# fuzzyc draws its initial membership matrix from numpy's global random state;
# seeding it first makes the clustering repeatable from one run to the next.
FUZZY_SEED = 0
np.random.seed(FUZZY_SEED)
result = fuzzyc(dataArray2.T,8,2,1000,0.001)

In [11]:
### plotting the results ### 

# Scatter of the two columns of dataArray3, coloured by result[0]
# (the first element of the tuple returned by fuzzyc).
fig, ax = pyplot.subplots()
ax.scatter(dataArray3.T[0], dataArray3.T[1], c=result[0])
display(fig)

In [12]:
### off the shelf fuzzy C means algorithm for comparison ###
import skfuzzy as fuzz

# Same input and settings as the fuzzyc run: 8 clusters, fuzzy coefficient 2,
# error tolerance 0.001 and at most 1000 iterations.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    data=dataArray2.T,
    c=8,
    m=2,
    error=0.001,
    maxiter=1000,
    init=None,
)

In [13]:
#K means clustering
from scipy import cluster
from matplotlib import pyplot

# kmeans returns a pair, kept whole as `initial` and unpacked as (cent, var);
# vq then yields, for every row of dataArray2, a code into cent and a distance.
initial = (cent, var) = cluster.vq.kmeans(dataArray2, 8)
assignment, cdist = cluster.vq.vq(obs=dataArray2, code_book=cent)

In [14]:
### Intercluster distances for K means, fuzzy c means and off the shelf sk-fuzzy ###

def _interclusterDistances(centers):
  """Pairwise distances between the rows of ``centers``.

  Returns a square list of lists whose entry [j][i] is
  dist(centers[j], centers[i]).
  """
  return [[dist(rowJ, rowI) for rowI in centers] for rowJ in centers]

# One helper replaces three copies of the same nested loop.
kmeansintercluster = _interclusterDistances(cent)

skfuzzyintercluster = _interclusterDistances(cntr)

fuzzyintercluster = _interclusterDistances(result[1])

In [15]:
### EXPORTING FILES TO ANALYZE RESULTS ###

def tf2(tokens):
  """Raw occurrence count of every distinct token in ``tokens``.

  Same counting as tf(), but the counts are returned as they are, without
  being divided by the total number of tokens.
  """
  counts = {}
  for token in tokens:
    if token in counts:
      counts[token] += 1
    else:
      counts[token] = 1
  return counts

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
MOUNT_NAME = "mount2"

def _exportCsv(frame, name):
  """Write the pandas DataFrame ``frame`` to /mnt/<MOUNT_NAME>/reuters/<name>.

  The frame is converted to a Spark DataFrame, repartitioned to a single
  partition and saved in the com.databricks.spark.csv format with a header.
  """
  (sqlCtx.createDataFrame(frame)
         .repartition(1)
         .write.format("com.databricks.spark.csv")
         .option("header", "true")
         .save("/mnt/%s/reuters/%s" % (MOUNT_NAME, name)))

#dataArray3
_exportCsv(pd.DataFrame(dataArray3), "dataArray3")

#dataArray2
_exportCsv(pd.DataFrame(dataArray2), "dataArray2")

#allocation
_exportCsv(pd.DataFrame(result[0]), "allocation")

#mmat
_exportCsv(pd.DataFrame(result[4]), "mmat")

#tfidf
_exportCsv(df, "tfidf")

#just tfs, not tf-idfs
tfarray = [tf2(doc[1]) for doc in tokenslist]

# Only columns (terms) with at most this many missing values are kept.
# NOTE(review): 1550 presumably relates to the number of documents in the
# corpus - confirm before reusing this cell with other data.
MAX_MISSING_DOCS = 1550

# The frame is called tfCounts instead of tf so that it no longer shadows the
# function tf(), which tfidf() calls; re-running the tf-idf cell after this
# one used to fail because ``tf`` had become a DataFrame.
tfCounts = pd.DataFrame(tfarray)
tfCounts = tfCounts.loc[:, tfCounts.isnull().sum(axis=0) <= MAX_MISSING_DOCS]
tfCounts = tfCounts.fillna(0)

#tf
_exportCsv(tfCounts, "tf")

#mmat_skfuzzy
_exportCsv(pd.DataFrame(u.T), "mmat_skfuzzy")

#cntr_skfuzzy
_exportCsv(pd.DataFrame(cntr), "cntr_skfuzzy")

#centers
_exportCsv(pd.DataFrame(result[1]), "center")

#Kmeans allocation
_exportCsv(pd.DataFrame(assignment), "kmeans_allocation")

#Kmeans centroids
_exportCsv(pd.DataFrame(cent), "kmeans_centroid")

#Kmeans intercluster dist
_exportCsv(pd.DataFrame(kmeansintercluster), "kmeans_intercluster")

#skfuzzy intercluster dist
_exportCsv(pd.DataFrame(skfuzzyintercluster), "skfuzzy_intercluster")

#fuzzy intercluster dist
_exportCsv(pd.DataFrame(fuzzyintercluster), "fuzzy_intercluster")