## K-means clustering

http://brandonrose.org/clustering

https://www.youtube.com/watch?v=ORpDAUQUnkU

In [173]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction

In [252]:
df = pd.read_csv('mbti.csv', index_col=None) 
df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [253]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
type     8675 non-null object
posts    8675 non-null object
dtypes: object(2)
memory usage: 135.7+ KB


In [254]:
nltk.download('stopwords')


REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|')
URL_RE = re.compile('(\w+:\/\/\S+)|^rt|http.+?')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() 
    text = text.replace("|||","\n")
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [255]:
df['posts'] = df['posts'].apply(clean_text)
df

Unnamed: 0,type,posts
0,INFJ,enfp intj moments sportscenter top ten plays p...
1,ENTP,finding lack posts alarming sex boring positio...
2,INTP,good one course say know blessing curse absolu...
3,INTJ,dear intp enjoyed conversation day esoteric ga...
4,ENTJ,fired another silly misconception approaching ...
...,...,...
8670,ISFP,ixfp always think cats fi doms reason especial...
8671,ENFP,thread already exists someplace else heck dele...
8672,INTP,many questions things would take purple pill p...
8673,INFP,conflicted right comes wanting children honest...


In [256]:
df["EI"] = [row["type"][0] for _,row in df.iterrows()]
df

Unnamed: 0,type,posts,EI
0,INFJ,enfp intj moments sportscenter top ten plays p...,I
1,ENTP,finding lack posts alarming sex boring positio...,E
2,INTP,good one course say know blessing curse absolu...,I
3,INTJ,dear intp enjoyed conversation day esoteric ga...,I
4,ENTJ,fired another silly misconception approaching ...,E
...,...,...,...
8670,ISFP,ixfp always think cats fi doms reason especial...,I
8671,ENFP,thread already exists someplace else heck dele...,E
8672,INTP,many questions things would take purple pill p...,I
8673,INFP,conflicted right comes wanting children honest...,I


In [257]:
df["NS"] = [row["type"][1] for _,row in df.iterrows()]
df

Unnamed: 0,type,posts,EI,NS
0,INFJ,enfp intj moments sportscenter top ten plays p...,I,N
1,ENTP,finding lack posts alarming sex boring positio...,E,N
2,INTP,good one course say know blessing curse absolu...,I,N
3,INTJ,dear intp enjoyed conversation day esoteric ga...,I,N
4,ENTJ,fired another silly misconception approaching ...,E,N
...,...,...,...,...
8670,ISFP,ixfp always think cats fi doms reason especial...,I,S
8671,ENFP,thread already exists someplace else heck dele...,E,N
8672,INTP,many questions things would take purple pill p...,I,N
8673,INFP,conflicted right comes wanting children honest...,I,N


In [258]:
df["FT"] = [row["type"][2] for _,row in df.iterrows()]
df

Unnamed: 0,type,posts,EI,NS,FT
0,INFJ,enfp intj moments sportscenter top ten plays p...,I,N,F
1,ENTP,finding lack posts alarming sex boring positio...,E,N,T
2,INTP,good one course say know blessing curse absolu...,I,N,T
3,INTJ,dear intp enjoyed conversation day esoteric ga...,I,N,T
4,ENTJ,fired another silly misconception approaching ...,E,N,T
...,...,...,...,...,...
8670,ISFP,ixfp always think cats fi doms reason especial...,I,S,F
8671,ENFP,thread already exists someplace else heck dele...,E,N,F
8672,INTP,many questions things would take purple pill p...,I,N,T
8673,INFP,conflicted right comes wanting children honest...,I,N,F


In [283]:
df["PJ"] = [row["type"][3] for _,row in df.iterrows()]
df

Unnamed: 0,type,posts,EI,NS,FT,PJ,cluster
0,INFJ,enfp intj moments sportscenter top ten plays p...,I,N,F,J,0
1,ENTP,finding lack posts alarming sex boring positio...,E,N,T,P,0
2,INTP,good one course say know blessing curse absolu...,I,N,T,P,0
3,INTJ,dear intp enjoyed conversation day esoteric ga...,I,N,T,J,0
4,ENTJ,fired another silly misconception approaching ...,E,N,T,J,0
...,...,...,...,...,...,...,...
8670,ISFP,ixfp always think cats fi doms reason especial...,I,S,F,P,0
8671,ENFP,thread already exists someplace else heck dele...,E,N,F,P,0
8672,INTP,many questions things would take purple pill p...,I,N,T,P,0
8673,INFP,conflicted right comes wanting children honest...,I,N,F,P,0


In [260]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [261]:
documents = df["posts"].values.astype("U")

In [262]:
vectorizer = TfidfVectorizer(stop_words="english")
features = vectorizer.fit_transform(documents)

In [266]:
k = 2
model = KMeans(n_clusters=k,init="k-means++",max_iter=100,n_init=1)
model.fit(features)

KMeans(max_iter=100, n_clusters=2, n_init=1)

In [267]:
df["cluster"] = model.labels_

In [268]:
df.head()

Unnamed: 0,type,posts,EI,NS,FT,PJ,cluster
0,INFJ,enfp intj moments sportscenter top ten plays p...,I,N,F,I,0
1,ENTP,finding lack posts alarming sex boring positio...,E,N,T,E,0
2,INTP,good one course say know blessing curse absolu...,I,N,T,I,0
3,INTJ,dear intp enjoyed conversation day esoteric ga...,I,N,T,I,0
4,ENTJ,fired another silly misconception approaching ...,E,N,T,E,0


In [278]:
def calculate_score(column,reverse):
    count = 0
    if(reverse):
        a = column[1]
        b = column[0]
    else:
        a = column[0]
        b = column[1]
    for i,row in df.iterrows():
        if((row[column]==a) and (row.cluster==1)):
            count += 1
        elif((row[column]==b) and (row.cluster==0)):
            count += 1
    return count/(i+1)

In [279]:
def score(column):
    a = calculate_score(column,False)
    b = calculate_score(column,True)
    if(a>b):
        x = column[0]
        y = column[1]
        score = a
    else:
        x = column[1]
        y = column[0]
        score = b
    print(f"accuracy with ({x},{y}) = (1,0): {score}")

In [284]:
score("EI")
score("NS")
score("FT")
score("PJ")

accuracy with (E,I) = (1,0): 0.7636887608069164
accuracy with (S,N) = (1,0): 0.8563688760806917
accuracy with (T,F) = (1,0): 0.5398270893371758
accuracy with (J,P) = (1,0): 0.6026512968299712
