In [32]:
%matplotlib inline
import pickle
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score

print("imported ")

imported 


In [33]:

# Set up the stemming / lemmatizing helpers used by the text pipeline.
# `stemmer` is created here but is not referenced by any code visible in this
# notebook section.
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
# Wrap the lemmatizer's bound method in an LRU cache (up to 50000 distinct
# argument combinations) so repeated calls with the same arguments are served
# from memory.
lemmatize = lru_cache(maxsize=50000)(wordnet_lemmatizer.lemmatize)
 
# ===========helper methods ========================================
def remove_non_ascii(s):
    """Return `s` with every character whose code point is 128 or higher removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, s)
    return "".join(ascii_only)

def stop_words_list():
    """
        Build the stop-word set used when filtering tokens.

        Combines NLTK's English stop words with a hand-picked list of noisy
        tokens observed in these timelines. The hand-picked part is specific
        to this data and would change for a different set of timelines.

        :return: set of stop-word strings
    """
    noisy_tokens = [
        'bc', 'http', 'https', 'co', 'rt', 'one', 'us', 'new',
        'lol', 'may', 'get', 'want', 'like', 'love', 'no', 'thank', 'would', 'thanks',
        'via', 'today', 'gt', 'great', 'watch', 'watched', 'season',
        '00p', 'roger',
    ]
    english = nltk.corpus.stopwords.words("english")
    return set(english).union(noisy_tokens)

def remove_urls(text):
    """
    Strip URLs and @-mentions from a piece of text.

    Every run of non-whitespace characters that starts with ``@``,
    ``http://`` or ``https://`` is deleted. Whitespace around the removed
    span is left as-is, so two spaces may remain where a URL used to be.

    :param text: input string (e.g. a raw tweet)
    :return: the string with the matching spans removed
    """
    # `https?` makes only the trailing "s" optional, so both schemes are
    # handled in a single pass and nothing other than a real scheme matches.
    return re.sub(r"(?:@|https?://)\S+", "", text)

def get_wordnet_pos(treebank_tag):
    """
    Translate a part-of-speech tag into the single-letter code passed to the
    lemmatizer.

    :param treebank_tag: tag string; only its leading letter is inspected
    :return: 'a' for tags starting with J, 'v' for V, 'n' for N, 'r' for R,
             otherwise None
    """
    prefix_to_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None

def tokenize(text):
    """
    Clean a piece of text and split it into tokens.

    Steps, in order: remove URLs / @-mentions, drop non-ASCII characters,
    delete single and double quote characters, then return every regex match
    as a token.

    :param text: raw text
    :return: list of token strings (case is preserved)
    """
    cleaned = remove_non_ascii(remove_urls(text))
    cleaned = re.sub(r"""[\'\"]""", '', cleaned)

    # NOTE(review): alternatives are tried left to right, and the first one
    # matches at every position where the second could start, so the second
    # alternative never wins; a hyphenated word comes out as separate tokens.
    alternatives = (
        r"""(?:[\w_]+)""",                          # regular word
        r"""(?:[a-z][a-z'\-_]+[a-z])"""             # word with an apostrophe or a dash
    )
    combined = r"""(%s)""" % "|".join(alternatives)
    matcher = re.compile(combined, re.VERBOSE | re.I | re.UNICODE)
    return matcher.findall(cleaned)

def replace_acronym(tokens, slang_dict):
    """
    Expand tokens that have an entry in `slang_dict`.

    A token found in the mapping is replaced by the whitespace-separated
    words of its mapped value (possibly zero words); any other token is kept
    unchanged. Order is preserved.

    :param tokens: iterable of token strings
    :param slang_dict: mapping from token to its expansion string
    :return: new list of tokens
    """
    expanded = []
    for tok in tokens:
        replacement = slang_dict[tok].split() if tok in slang_dict else [tok]
        expanded += replacement
    return expanded

def tokenize_and_lemmatize(text, slang_dict, stop_words):
    """
    Turn raw text into a list of lemmatized content words.

    The text is tokenized, lowercased, slang-expanded and POS-tagged. Only
    tokens whose tag maps to an adjective, verb, noun or adverb code, and
    which are not in `stop_words`, are lemmatized and returned.

    :param text: raw text
    :param slang_dict: mapping used to expand slang / acronym tokens
    :param stop_words: container of tokens to drop
    :return: list of lemmas, in order of appearance
    """
    lowercase_tokens = [tok.lower() for tok in tokenize(text)]
    expanded = replace_acronym(lowercase_tokens, slang_dict)

    lemmas = []
    for word, tag in pos_tag(expanded):
        wordnet_pos = get_wordnet_pos(tag)
        # skip anything that is not a verb, noun, adjective or adverb,
        # as well as stop words
        if wordnet_pos is None or word in stop_words:
            continue
        try:
            lemmas.append(lemmatize(word, wordnet_pos))
        except UnicodeDecodeError:
            # best effort: a token that cannot be decoded is dropped
            pass
    return lemmas

def read_in_dict(filename):
    """
    Load a ``key: value`` mapping from a text file, one entry per line.

    Each line is split on its first ``:``; the text before it (stripped) is
    the key and the text after it (stripped) is the value. Lines without a
    ``:`` or with an empty key (e.g. blank lines) are skipped, so they cannot
    create entries that map a token to an empty string.

    :param filename: path of the file to read
    :return: dict mapping key strings to value strings
    """
    mapping = {}
    with open(filename) as f:
        for line in f:
            key, sep, value = line.partition(":")
            key = key.strip()
            if not sep or not key:
                continue
            mapping[key] = value.strip()
    return mapping



In [38]:
# Load the tweet table from disk and report its (rows, columns) shape.
test_df = pd.read_csv("data/test.csv",  low_memory=False)
print(test_df.shape)
# drop the rows with no tweet lists
# NOTE: the frame is modified in place and the surviving rows keep their
# original index labels, so the index has gaps after this step.
test_df.dropna(subset=['tweet'], inplace=True)
print(test_df.shape)


(349085, 5)
(348611, 5)


Reading the data set

In [35]:
# quick look at the first five rows
print(test_df.head(5))

  Unnamed: 0                                              tweet  tweets_count  \
0          0  Check out this musical: https://t.co/ZIzIikRjf...         129.0   
1          1          LIVE on #YouNow - https://t.co/AIKnwfKWUX         129.0   
2          2          LIVE on #YouNow - https://t.co/3kRLkTdEmG         129.0   
3          3          LIVE on #YouNow - https://t.co/g5o35gU0L3         129.0   
4          4          LIVE on #YouNow - https://t.co/f1T36k9k0q         129.0   

      user_id     user_name  
0  2994953628  obeyy_hurr13  
1  2994953628  obeyy_hurr13  
2  2994953628  obeyy_hurr13  
3  2994953628  obeyy_hurr13  
4  2994953628  obeyy_hurr13  


In [39]:
# convert the pandas series (tweet text) to numpy array
# Select the column by name rather than by position so this keeps working if
# the column order of the CSV changes; this is the same 'tweet' column the
# missing-value filter uses.
text = test_df['tweet']
tweet_text = np.array(text)
print (len(tweet_text))

348611


Take the column of tweet text and convert to numpy array, for use in tfidf

In [None]:
def get_tfidf_model(texts, stop_words= None, slang_dict=None):
    """
    Fit a TF-IDF model over `texts` using the custom tokenize/lemmatize pipeline.

    :param texts: iterable of raw text documents
    :param stop_words: container of tokens to drop; when None the default
                       list from stop_words_list() is used
    :param slang_dict: mapping used to expand slang / acronym tokens; when
                       None no expansion is performed
    :return: tuple (tfidf_model, vectorizer) where tfidf_model is the
             document-term matrix returned by fit_transform and vectorizer is
             the fitted TfidfVectorizer
    """
    # Honour the caller's arguments; only fall back to defaults when omitted.
    if stop_words is None:
        stop_words = stop_words_list()
    if slang_dict is None:
        slang_dict = {}

    # Stop-word filtering happens inside the custom tokenizer, so the
    # vectorizer's own stop_words option stays off.
    # NOTE: the tokenizer is a lambda, which the standard pickle module cannot
    # serialize; persist the returned matrix rather than the vectorizer.
    vectorizer = TfidfVectorizer(
        tokenizer=lambda text: tokenize_and_lemmatize(text, slang_dict, stop_words),
        stop_words=None,
        max_df=0.9,   # ignore terms present in more than 90% of documents
        min_df=5,     # ignore terms present in fewer than 5 documents
        lowercase=True,
        decode_error='ignore',
    )

    tfidf_model = vectorizer.fit_transform(texts)
    return tfidf_model, vectorizer
    

# Load the slang expansion map, then fit TF-IDF on the tweet texts.
slang_dict = read_in_dict("data/out_slang_map.csv")
tfidf_model, vectorizer = get_tfidf_model(
    texts=tweet_text,
    stop_words=stop_words_list(),
    slang_dict=slang_dict,
)

# (number of documents, vocabulary size)
print(tfidf_model.shape)

In [None]:
# Persist the TF-IDF matrix so later cells can reload it instead of refitting.
with open("tfidf/unlabelled.pkl", 'wb') as out_file:
    pickle.dump(tfidf_model, out_file)

In [None]:
#load the content
# Use a context manager so the file handle is closed once the matrix is read.
# NOTE: unpickling executes code from the file; only load pickles you created.
with open("tfidf/unlabelled.pkl", "rb") as handle:
    tfidf_model = pickle.load(handle)

In [None]:
# gmm maximisation
# Configure a 10-component Gaussian mixture model and print its parameters;
# nothing is fitted in this cell.
# NOTE(review): mixture.GMM is the legacy estimator; newer scikit-learn
# releases provide mixture.GaussianMixture instead (different parameter names
# and defaults) - confirm against the installed version before running.
clf = mixture.GMM(n_components=10, n_iter=500, n_init=10)
print (clf)

Akaike Information Criterion (AIC) and the Bayesian Information Criterion (BIC)

In [None]:
# Coarse sweep: candidate component counts 2, 7, 12, ... up to (not including) 200.
n_estimators = np.arange(2, 200, 5)
# The mixture model is fitted on a dense copy of the TF-IDF matrix.
tfarray = tfidf_model.toarray()
clfs = [
    mixture.GMM(n_components=k, n_iter=500).fit(tfarray)
    for k in n_estimators
]
# Score every fitted model with both information criteria.
bics = [fitted.bic(tfarray) for fitted in clfs]
aics = [fitted.aic(tfarray) for fitted in clfs]

plt.plot(n_estimators, bics, label='BIC')
plt.plot(n_estimators, aics, label='AIC')
plt.legend()


In [None]:
# Medium sweep: candidate component counts 2, 4, 6, ... up to (not including) 50.
n_estimators = np.arange(2, 50, 2)
# The mixture model is fitted on a dense copy of the TF-IDF matrix.
tfarray = tfidf_model.toarray()
clfs = [
    mixture.GMM(n_components=k, n_iter=500).fit(tfarray)
    for k in n_estimators
]
# Score every fitted model with both information criteria.
bics = [fitted.bic(tfarray) for fitted in clfs]
aics = [fitted.aic(tfarray) for fitted in clfs]

plt.plot(n_estimators, bics, label='BIC')
plt.plot(n_estimators, aics, label='AIC')
plt.legend()

In [None]:
# Fine sweep: every component count from 2 to 19.
n_estimators = np.arange(2,20, step=1)
# The mixture model is fitted on a dense copy of the TF-IDF matrix.
tfarray = tfidf_model.toarray()
clfs = [mixture.GMM(n, n_iter=500).fit(tfarray) for n in n_estimators]
bics = [clf.bic(tfarray) for clf in clfs]
aics = [clf.aic(tfarray) for clf in clfs]

plt.plot(n_estimators, bics, label= 'BIC')
plt.plot(n_estimators, aics, label= 'AIC')
plt.legend()

# Tabulate the scores. Each row is labelled with the component count that
# actually produced it (taken from n_estimators) rather than a separate
# counter, so the table and the plot agree.
print("n             bic              aic ")
for n, bic, aic in zip(n_estimators, bics, aics):
    print("%d             %d              %d" % (n, bic, aic) )

<hr>
Now, as per the GMM analysis, we know the best fit with the fewest components is obtained at clusters = 12.
Let's run k-means.


In [None]:
# run k means :
clusters = 12
# Fixed seed so the cluster assignment (and everything derived from it) is
# reproducible across kernel restarts.
KMEANS_SEED = 42
# The silhouette score needs pairwise distances between all scored samples,
# which is impractical for the full data set; score a random subset instead.
SILHOUETTE_SAMPLE_SIZE = 10000

km_model = KMeans(n_clusters=clusters, random_state=KMEANS_SEED)
cluster = km_model.fit_predict(tfidf_model)

# result analysis :
# (the value printed is an estimate computed on the sampled subset)
sil_score = silhouette_score(
    tfidf_model,
    cluster,
    sample_size=min(SILHOUETTE_SAMPLE_SIZE, tfidf_model.shape[0]),
    random_state=KMEANS_SEED,
)
print("Silhouette score                   : %f" % sil_score)


In [None]:
# For every cluster, list the 10 terms with the largest centroid weights.
# argsort is ascending, so reverse each row to get the heaviest terms first.
order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]
# Newer scikit-learn exposes the vocabulary via get_feature_names_out();
# older releases only have get_feature_names(). Support both.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [None]:
# ==============================
# User Similarity
# ==============================

# Given a pandas dataframe with user_id, tweet and cluster label, calculate
# the pairwise similarity between users.

# `cluster` holds one label per row of test_df, in row order. Assign it
# positionally: wrapping it in pd.Series would align on index labels, and
# test_df's index has gaps after the rows without tweets were dropped, which
# would attach labels to the wrong rows and leave some rows as NaN.
assert len(cluster) == len(test_df), "cluster labels and test_df rows are out of sync"
test_df['cluster'] = np.asarray(cluster)
print(test_df[:50])