In [3]:
import tweepy
import pandas as pd
from collections import namedtuple, Counter
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [4]:
# set up Twitter credentials using tweepy 

auth = tweepy.OAuthHandler('UEALJgD2o5lwpOeAeNhb9ceWX', 'JqB4i0TIqRizGtp97BovQN5iQDgA5BpF5tHHL0nZkUEANjxJuV')
auth.set_access_token('732602583689367552-KeUxTRNYO2XwFux13Tp1yLoKyylMyO7', '0WWtEXmjOV1g732CBPs9Te8xQa7VKuSIBP9A0J4dxm1mL')

api = tweepy.API(auth)

In [5]:
# use named tuples to keep info of interest for each tweet; keep urls to remove from text later

Tweet = namedtuple('Tweet', ['text', 'tags', 'cited', 'urls', 'author'])

In [6]:
# utility functions to collect info of interest

def get_hashtags(tweet):
    tags = []
    if len(tweet['entities']['hashtags']) > 0:
        for tag in tweet['entities']['hashtags']:
            tags.append(tag['text'])
    return tags

def get_citations(tweet):
    citations = []
    if len(tweet['entities']['user_mentions']) > 0:
        for mention in tweet['entities']['user_mentions']:
            citations.append(mention['screen_name'])
    return citations

def get_urls(tweet):
    urls = []
    if len(tweet['entities']['urls']) > 0:
        for url in tweet['entities']['urls']:
            urls.append(url['url'])
    return urls
           

In [7]:
# pull in tweets for each influencer listed in influencers.txt and process most recent 500 that are not retweets
# store as lists in a dict with the influencer handles as keys 

all_tweets = {}
with open('influencers.txt', 'r') as names:
    for raw_name in names:
        name = raw_name.strip()
        tweets = tweepy.Cursor(api.user_timeline, id=name, tweet_mode='extended').items(800)
        keepers = [tweet._json for tweet in tweets if 'retweeted_status' not in tweet._json]
        all_tweets[name] = [Tweet(tweet['full_text'], get_hashtags(tweet), get_citations(tweet), get_urls(tweet),
                                  name) for tweet in keepers[:500]]    

names = all_tweets.keys()



KeyboardInterrupt: 

In [None]:
# also get the description each person uses

descriptions = {}
for name in names:
    descriptions[name] = api.get_user(name)._json['description']

In [None]:
# save data in case kernel hangs or is stopped

with open('twitter_data.pydat', 'wb') as datafile:
    pickle.dump(all_tweets, datafile)

with open('twitter_descriptions.pydat', 'wb') as datafile:
    pickle.dump(all_tweets, datafile)

In [None]:
# create dict of Counters for the hashtags used by each person

all_tags = {}
for name in names:
    tags = [tweet.tags for tweet in all_tweets[name]]
    all_tags[name] = Counter(list(itertools.chain(*tags)))

In [None]:
# create dict of Counters for the mentions made by each person

all_mentions = {}
for name in names:
    mentions = [tweet.cited for tweet in all_tweets[name]]
    all_mentions[name] = Counter(list(itertools.chain(*mentions)))

In [None]:
# create Counter for all the hashtags used

tags = Counter()
for name in names:
    tags += all_tags[name]

In [None]:
len(list(tags))

In [None]:
# collect all hashtags used by more than one influencer for use as features
# should have done this with the hashtags too

tag_features = [tag for tag in list(tags.keys()) if sum([0 if all_tags[name][tag] == 0 else 1 for name in names]) > 1]
len(tag_features)

In [None]:
# create Counter for all the mentions made

mentions = Counter()
for name in names:
    mentions += all_mentions[name]

In [None]:
len(list(mentions))

In [None]:
# collect all mentions made by more than one influencer for use as features

mention_features = [mention for mention in list(mentions.keys()) if sum([0 if all_mentions[name][mention] == 0 else 1
                                                                         for name in names]) > 1]
len(mention_features)

In [None]:
# find duplicates between tags and mentions

duplicates = list(set(tag_features) & set(mention_features))
len(duplicates)

In [None]:
# remove from mentions since there are more of them
# Note: this turned out to be unnecssary because tokenization split off the '#' but not the '@'

mention_features = [mention for mention in mention_features if mention not in duplicates]
len(mention_features)

In [None]:
# create feature vectors comprising the counts for each tag and mention feature

entities = pd.DataFrame(index=names, columns=(tag_features + mention_features))
for name in names:
    for tag in tag_features:
        entities.loc[name, tag] = all_tags[name][tag]
    for mention in mention_features:
        entities.loc[name, mention] = all_tags[name][mention]

In [None]:
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, SpectralClustering, DBSCAN
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [None]:
# normalize the features and perform a 2-component PCA analysis to use for graphing

normed_ents = normalize(entities)

In [None]:
ents_pca = PCA(2).fit(normed_ents)
ents_pca.explained_variance_ratio_

### Try basic mean shift clustering

In [None]:
ms = MeanShift()
ms.fit(normed_ents)
ms.labels_

Mean shift did not produce usable results, merely clustering all the authors together except for four clusters of one person each.

### Try k-means clustering

In [None]:
# using silhouette scores over 1,000 runs to find the optimal number of clusters, then the best clusters

clusters = []
for _ in range(1000):
    best_score = 0
    num_labels = 0
    for num_clusters in range(2, 7):
        km = KMeans(num_clusters)
        km.fit(normed_ents)
        sil_score = silhouette_score(normed_ents, km.labels_)
        if sil_score > best_score:
            best_score = sil_score
            num_labels = len(np.unique(km.labels_))
    clusters.append(num_labels)

optimal_num_clusters = Counter(clusters).most_common(1)[0][0]
print('The optimal number of clusters is {}'.format(optimal_num_clusters))
      
best_labels = []
best_score = 0

for _ in range(1000):
    km = KMeans(optimal_num_clusters)
    km.fit(normed_ents)
    sil_score = silhouette_score(normed_ents, km.labels_)
    if sil_score > best_score:
        best_score = sil_score
        best_labels = km.labels_

for i in range(optimal_num_clusters):
    print('Cluster {} comprises {}'.format(i, [name for ix, name in enumerate(names) if best_labels[ix] == i]))

In [None]:
Counter(clusters).most_common()

The algorithm generated one cluster of authors with very few hashtags and mentions, and another comprising everyone else. This is a valid but not very sophisticated breakdown. The only really interesting result is that one author with a relatively high number of tags and mentions (EvanSinar, with 665) was put in the group with low numbers. Apparently he had few of these entities in common with others, resulting in low similarity scores as did authors with few entities.

In [None]:
for name in names:
    print('{:15}: {} entities'.format(name, entities.loc[name].sum()))

The Twitter account descriptions are not very helpful for discovering similarities. Most are very similar.

In [None]:
groups = pd.DataFrame(index=names, columns=['description', 'label'])
for name in names:
    groups.loc[name, 'description'] = descriptions[name]
groups.loc[:, 'label'] = best_labels

In [None]:
for label in np.unique(best_labels):
    for name in names:
        if groups.loc[name, 'label'] == label:
            print(name, ':  ', groups.loc[name, 'description'], '\n')
    print('\n\n')

### Plot the clusters against the two principal components for a simple visualization.

In [None]:
viz = ents_pca.transform(entities)
viz = pd.DataFrame(viz, index=names, columns=['pca0', 'pca1'])
viz['label'] = best_labels

In [None]:
cmap = sns.cubehelix_palette(n_colors=2, rot=-0.7)
sns.relplot(x='pca0', y='pca1', hue='label', data=viz, palette=cmap)

Although the plot utiizes only the first two components, which combine to explain less than half of the variance, and a few points are not individually visible since they are too close together, even here the clusters can be seen to comprise a large group occupying most of the plot and a group of outliers. With the exception of EvanSinar, the outlying cluster groups together people who included zero or very few hashtags and mentions common to others in their tweets.

In [None]:
viz['num_entities'] = [sum(list(entities.iloc[ix])) for ix, name in enumerate(names)]

viz.sort_values(by='num_entities')

The optimal KMeans grouping scheme of 2 clusters divides the authors into a group with very few common hashtags and mentions, and another group comprising authors with many common hashtags and mentions. EvanSinar is the exception, having 665 common hashtags and mentions but appearing in the group with very few of these. This might represent one network comprising 9 members and 6 people whose networks do not intersect with each other or those in the 9-member group, but the few-entities group also might comprise mostly authors whose style is to not use many hashtags or mentions. It would be interesting to look at the 6-cluster groupings, but this question cannot be settled given the data. Without all of the authors using a good number of common hashtags and/or mentions, clustering on these features will not produce useful results. Clustering on only those authors who do use large numbers of these entities could produce insights.

### Try spectral clustering

In [None]:
sc = SpectralClustering(n_clusters=4)
sc.fit(entities)

sc_labels = sc.labels_
sc_labels

The warning arises from the fact that two of the authors have entity vectors that are all zeroes. This results in the graph being not fully connected. Forcing the algorithm to create four clusters produces one large cluster, two 2-person clusters and a 1-person cluster. At this point, the reults taken together indicate that clustering on the hashtags and mentions is not very useful.

## Try clustering using tf-idf vectorization

In [None]:
import re

In [None]:
# gather the tweet texts as elements of a list in preparation for tf-idf vectorization
# create a dataframe with a list for each author, for clustering, and one with indidvidual tweets and their authors

tweet_texts = []
tweet_authors = []
tweets_by_author = pd.DataFrame(index=names, columns=['combined'])


for name in names:
    tweets = [re.sub('https?:\/\/[-\w.]\.?[a-zA-Z0-9]+\/?[a-zA-Z0-9]*', '', tweet.text).strip()
              for tweet in all_tweets[name]]
    tweets_by_author.loc[name, 'combined'] = ' '.join(tweets)
    tweet_texts.extend(tweets)
    tweet_authors.extend([name] * len(tweets))
        

In [None]:
tweets_df = pd.DataFrame({'text': tweet_texts, 'author': tweet_authors})

In [None]:
# reality check: the tweets in the dataframe should change authors at a multiple of 500

tweets_df.loc[995:1005]

Apply tf-idf vectorization. Tweets are compact and rely on impact words more than grammatical constructions, so vocabulary seems more appropriate than tokenization and analysis involving parts of speech and phraseology. A question to be settled is whether individual tweets are too short to produce meaningful vectorizations, in which case groupings of several tweets might produce better results.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [None]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [None]:
auth_vectors = vectorizer.fit_transform(tweets_by_author['combined'])
terms = vectorizer.get_feature_names()
cluster_df = pd.DataFrame(auth_vectors.toarray(), index=names, columns=terms)

### Re-run the clustering algorithms on the vectorized combined tweets

In [None]:
ms.fit(normalize(cluster_df))
ms.labels_

In [None]:
clusters = []
cluster_data = normalize(cluster_df)
for _ in range(1000):
    best_score = -999
    num_labels = 0
    for num_clusters in range(5, 10):
        km = KMeans(num_clusters)
        km.fit(cluster_data)
        sil_score = silhouette_score(cluster_data, km.labels_)
        if sil_score > best_score:
            best_score = sil_score
            num_labels = num_clusters
    clusters.append(num_labels)

optimal_num_clusters = Counter(clusters).most_common(1)[0][0]
print(Counter(clusters).most_common())
print('The optimal number of clusters is {}'.format(optimal_num_clusters))
      
best_labels1 = []
best_score1 = -999
centers1 = []

for _ in range(1000):
    km = KMeans(optimal_num_clusters)
    km.fit(cluster_data)
    sil_score = silhouette_score(cluster_data, km.labels_)
    if sil_score > best_score1:
        best_score1 = sil_score
        best_labels1 = km.labels_
        centers1 = km.cluster_centers_

for i in range(optimal_num_clusters):
    print('Cluster {} comprises {}'.format(i, [name for ix, name in enumerate(names) if best_labels1[ix] == i]))

In [None]:
# reduce feature space to 5 principal components and choose two with the greatest spread among authors for graphing
svd = TruncatedSVD(5)
cluster_lsa = svd.fit_transform(cluster_data)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components: ", total_variance*100)

authors_by_component = pd.DataFrame(cluster_lsa, index=names)
for i in range(5):
    print('Component {}:'.format(i))
    print(authors_by_component.loc[:,i].sort_values(ascending=False)[0:15])

In [None]:
authors_by_component['label'] = best_labels1
cmap = sns.color_palette('cubehelix', 7)
ax = sns.scatterplot(data=authors_by_component, x=1, y=4, hue='label', palette=cmap)
ax.set_title('Clusters plotted against 2 of 5 principal components')
ax.set_ylabel('lsa4')
ax.set_xlabel('lsa1')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

The clusters are fairly well segregated on a 2-component plot. Look for commonalities in the 10 highest-scoring terms for each author. 

In [None]:
words = cluster_df.T
groupings = pd.DataFrame(index=names, columns=range(10))
for name in names:
    groupings.loc[name, :] = words[name].sort_values(ascending=False)[0:10].index
groupings['label'] = best_labels1
groupings.sort_values(by='label')

It's hard to see much in common among the members of each cluster, but both members of cluster 2 talk about leadership. All three authors in cluster 3 highlight deep learning, and the two cluster 6 autors have a shared interest in 5g and applied tech in general.  The other groups have no obvious commonalities among members (clusters 1 and 5) or are individuals (clusters 0 and 4).

Look for words closest to the centroid of each cluster that score as high-impact words for cluster members, and see which of these words all members have in common.

In [None]:
centroids1 = centers1.argsort()[:, ::-1] 
words_df = pd.DataFrame(cluster_data, index=names, columns=terms)
print('High-impact words close to the centroid and common to all members in each cluster\n\n')
for i in range(optimal_num_clusters):
    print('Cluster {}:'.format(i))
    people = [name for name in names if groupings.loc[name, 'label'] == i]
    words = []
    for ind in centroids1[i]:
        word = terms[ind]
        common = True
        for person in people:
            if words_df.loc[person, word] <= 0.03: # the .03 threshold was obtained by trying various values
                common = False
                break
        if common == True:
            words.append(word)
        if len(words) == 20:
            break
    print('\n', words, '\n\n')

This programmatic analysis of the intra-group comonalities confirms and expands the insights from visual inspection of the top 10 words for each author.
- Clusters 0 and 4, individuals, have large numbers of words, because neither has another person to filter for common words.
- The words from cluster 1 words reveal no common topic of interest, but these are the authors who used few or no hashtags and mentions. So there is a logic to the grouping.
- The cluster 2 authors both used the terms 'leadership' and 'workplace,' showing perhaps a shared interest in management.
- For cluster 3, the authors had 'neuralnetworks' as well as 'deeplearning' in common, strengthening the topical bond.
- There is a surprise in that the cluster 5 authors share many terms in common. Rather than topics, however, they seem to comprise mostly mentions and groups, suggesting these authors have overlapping work and/or social networks.
- The cluster 6 authors shared 'edgecomputing' as well as '5g,' but also 'oracle,' enterprise software. These seem to suggest an applications-oriented focus.

*__It should be noted__*, however, that a number of the common influential "words" are joined phrases, indicating that they likely were hashtags (or mentions). This contaminates the results, since, for instance, any number of authors might have written about 'deep learning,' even to the point where 'deep' and 'learning' on their own ended up with very low scores, or maybe were even excluded because they occurred too frequently. This seems an intractable problem with the tf-idf approach employed. A bag-of-words approach would have the converse problem. 

The thematic topic(s) of each cluster can be analyzed by looking at the terms closest to its centroid, regardless of which members used those terms or had them in common.

In [None]:
print('Words closest to the centroid of each cluster\n\n')
for i in range(optimal_num_clusters):
    print('Cluster {}:'.format(i))
    words = [terms[ind] for ind in centroids1[i][:10]]
    print(words, '\n\n')

These results solidify the already established themes. Cluster 2 has 'hr' and 'hranalytics,' in addition to 'leadership' and 'workplace,' further emphasizing management concerns, but it also has 'odsc,' 'london,' '19,' 'keynote' and 'speaker,' suggesting the two authors in this group were strongly involved with the ODSC Europe conference held Sept. 19-22 this year in London. gains a sharper enterprise focus with the addition of 'sassoftware,' 'devops' and 'dataanalytics.' For cluster 3, 'datascientists,' 'digitaltransformation' and 'kdnuggets' extend the AI/machine learning emphasis of deep learning and neural networks. Similarly, 'emergingtech,' 'sassoftware,' 'dataanalytics' and 'devops' fit with the cluster 6 emphasis on applied data sciece and emerging technologies. The fact that there is little overlap between the central terms of clusters 1 and 5 and the high-scoring terms, respectively, of their members supports the idea that the connections here have more to do with things beside topical content — low use of hashtags and mentions for cluster 1 and network relationships for cluster 5. 

In [None]:
sc = SpectralClustering(n_clusters=optimal_num_clusters)

best_score = -999
best_sc_labels = []
for _ in range(1000):
    sc.fit(cluster_data)
    sil_score = silhouette_score(cluster_data, sc.labels_)
    if sil_score > best_score:
        best_score = sil_score
        best_sc_labels = sc.labels_


print(sc_labels, '\n')
for i in range(7):
    print('Cluster {} comprises {}'.format(i, [name for ix, name in enumerate(names) if sc_labels[ix] == i]))

These results are not as observably coherent as those from K-Means.

## Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tweets_df['text'], tweets_df['author'], test_size=0.4,
                                                    random_state=0, stratify=tweets_df['author'])


# apply the vectorizer, but don't fit on the test set to prevent data leakage
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

mcc_scorer = make_scorer(matthews_corrcoef)

Evaluate performance with cross-validation scoring of the basic estimators, using the Matthews correlation coefficient, which uses all the metrics from the confusion matrix. See if classification into clusters (7 classes) produces significantly better results thn classifying by individual authors (15 classes).

In [None]:
rfc = RandomForestClassifier(max_depth=20)

print(cross_val_score(rfc, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
rfc.fit(X_train_tfidf, y_train)
print(matthews_corrcoef(y_test, rfc.predict(X_test_tfidf)))

In [None]:
# create cluster targets

groupings = dict(zip(names, best_labels))
y_groupings_train = y_train.apply(lambda x: groupings[x])
y_groupings_test = y_test.apply(lambda x: groupings[x])

In [None]:
print(cross_val_score(rfc, X_train_tfidf, y_groupings_train, scoring=mcc_scorer, cv=4))

In [None]:
gbc = GradientBoostingClassifier()

print(cross_val_score(gbc, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(gbc, X_train_tfidf, y_groupings_train, scoring=mcc_scorer, cv=4))

In [None]:
lrc = LogisticRegression()

print(cross_val_score(lrc, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(lrc, X_train_tfidf, y_groupings_train, scoring=mcc_scorer, cv=4))

In [None]:
knc = KNeighborsClassifier(n_neighbors=10)

print(cross_val_score(knc, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
ada_rf = AdaBoostClassifier(rfc)

print(cross_val_score(ada_rf, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
ada_lr = AdaBoostClassifier(lrc)
print(cross_val_score(ada_lr, X_train_tfidf, y_train, scoring=mcc_scorer, cv=4))

Using cluster labels instead of individuals as the targets did not significantly improve the results, despite reducing the number of classes by half. The one exception to this was with gradient boosting, where the score increased by about 10%. Does it overfit?

In [None]:
gbc.fit(X_train_tfidf, y_train)
print(matthews_corrcoef(y_test, gbc.predict(X_test_tfidf)))

In [None]:
gbc.fit(X_train_tfidf, y_groupings_train)
print(matthews_corrcoef(y_groupings_test, gbc.predict(X_test_tfidf)))

Gradient boosting is not overfitting. How about logistic regression, which had similar performance? 

In [None]:
lrc.fit(X_train_tfidf, y_train)
print(matthews_corrcoef(y_test, lrc.predict(X_test_tfidf)))

In [None]:
lrc.fit(X_train_tfidf, y_groupings_train)
print(matthews_corrcoef(y_groupings_test, lrc.predict(X_test_tfidf)))

Neither of the two best-performing algorithms is overfitting. See if lemmatizing the tweets improves performance

In [None]:
import spacy
nlp = spacy.load('en_core_web_md')

In [None]:
def tokenize(input):
    tokens = nlp(input)
    output = [token.lemma_ for token in tokens
              if token.lemma_ not in ['#', ",'", ',"', ".'", '."', "?'", '?"']
              and not (token.is_punct and len(token.lemma_) == 1) # keep two-character emoticons, e.g. :) or ;) 
              and not token.is_stop]
    return ' '.join(output).strip()

In [None]:
X_train_tokenized = X_train.apply(tokenize)
X_test_tokenized = X_test.apply(tokenize)

In [None]:
X_train_tok_tfidf = vectorizer.fit_transform(X_train_tokenized)
X_test_tok_tfidf = vectorizer.transform(X_test_tokenized)

In [None]:
print(cross_val_score(lrc, X_train_tok_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(lrc, X_train_tok_tfidf, y_groupings_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(gbc, X_train_tok_tfidf, y_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(gbc, X_train_tok_tfidf, y_groupings_train, scoring=mcc_scorer, cv=4))

No significant improvement, and the cross-eval scores for gradient boosting are a little less consistent. At this point, I will choose logistic regression over gradient boosting, because performance is on a par and gradient boosting consumes much more compute resources, taking orders of magnitude longer. See if adding bigrams improves performance.

In [None]:
len(terms)

In [None]:
vectorizer2 = TfidfVectorizer(ngram_range=(1,2), 
                              max_df=0.5, # drop words that occur in more than half the paragraphs
                              min_df=2, # only use words that appear at least twice
                              max_features=6000, # increase to ensure that at least the 1,500 highest-scoring bigrams are used
                              stop_words='english', 
                              lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                              use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                              norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                              smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                             )

In [None]:
X_train_tfidf2 = vectorizer2.fit_transform(X_train)
X_test_tfidf2 = vectorizer2.transform(X_test)

In [None]:
print(cross_val_score(lrc, X_train_tfidf2, y_train, scoring=mcc_scorer, cv=4))

In [None]:
print(cross_val_score(lrc, X_train_tfidf2, y_groupings_train, scoring=mcc_scorer, cv=4))

Neither tokenization nor adding bigrams significantly improved performance, so tune the hyperparameters for logistic regression on the basic tf-idf data.

In [None]:
lrc_params = [{'penalty': ['l1', 'l2'], 'tol': [1e-4, 1e-3, 1e-2], 'C': [3, 5, 7, 10],
               'solver': ['liblinear'], 'multi_class': ['ovr']},
              { 'penalty': ['l2'], 'tol': [1e-4, 1e-3, 1e-2], 'C': [3, 5, 7, 10],
               'solver': ['newton-cg', 'lbfgs'], 'multi_class': ['ovr', 'multinomial']}]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [None]:
def optimize(rand, X=tweets_df['text'], Y=tweets_df['author'], transformer=None,
             clf=lrc, params=lrc_params, t_size=0.4):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=t_size, random_state=rand, stratify=Y)
    if transformer:
        X_train = transformer.fit_transform(X_train)
        X_test = transformer.transform(X_test)
    model = GridSearchCV(clf, param_grid=params, scoring=mcc_scorer)
    model.fit(X_train, Y_train)
    print('Optimal parameters: {}'.format(model.best_params_))
    preds = model.predict(X_test)
    print('Matthews score: {}'.format(matthews_corrcoef(Y_test, preds)))
    return model

In [None]:
opt1 = optimize(rand=7, transformer=vectorizer)

In [None]:
opt2 = optimize(rand=29, transformer=vectorizer)

In [None]:
opt3 = optimize(rand=41, transformer=vectorizer)

The model seems relatively stable. Further tweaking could probably nail down the optimal C value between 4 and 5, and the tolerance between .oo1 and .0001, but the optimation consistently selected the liblinear solver with the L2 penalty and the one-vs-rest multi-class strategy. See if varying the train/test ratio can improve performance.

In [None]:
opt4 = optimize(rand=7, transformer=vectorizer, t_size=0.35)

In [None]:
opt5 = optimize(rand=7, transformer=vectorizer, t_size=0.3)

The model is not overly sensitive to the relative sizes of the train/test split, although both 65/35 (best score) and 70/30 perform slightly better than 60/40. The decrease in the C parameter on the 70/30 split, however, might indicate that the model is starting to overfit, with larger regression coefficients.

In [None]:
model_f1 = f1_score(y_test, opt4.best_estimator_.predict(vectorizer.transform(X_test)), average=None)

In [None]:
model_f1

In [None]:
model_f1.mean()

The mean of the average F1 score, a more common metric, on the holdout test set for the 15-class predictions using the optimized logistic regression model is 0.89. The F1 scores for the individual classes ranged from 0.77 to 0.98, with 8 of the 15 classes scoring above .90 and only 1 scoring below .82.

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

In [None]:
y_test_bin = label_binarize(y_test, classes=list(names))
y_pred_bin = label_binarize(opt4.best_estimator_.predict(vectorizer.transform(X_test)), classes=list(names))

In [None]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(names)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
colors = cycle(['darkorange', 'cornflowerblue', 'lime', 'deeppink', 'darkviolet'])
plt.rcParams["figure.figsize"] = [12.0, 6.0]

for i, color in zip(range(len(names)), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='Class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves for classification by individual authors')
plt.legend(loc="lower right")
plt.savefig('ROC')
plt.show()

The ROC curves and AUC-ROC scores shows that the optimal model performs very well on classifying the tweets by individual author. (Only one author has a score less than .90). However, the curves and scores seem perhaps too good. With more time, it would be worth constructing TOC curves, which include the true and false negative results as well.

## Clustering (again)

Try clustering with bigrams added into the tf-idf vector features. 

In [None]:
auth_vectors2 = vectorizer2.fit_transform(tweets_by_author['combined'])
terms2 = vectorizer2.get_feature_names()
cluster_df2 = pd.DataFrame(auth_vectors2.toarray(), index=names, columns=terms2)

clusters = []
cluster_data2 = normalize(cluster_df2)
for _ in range(1000):
    best_score = -999
    num_labels = 0
    for num_clusters in range(5, 10):
        km = KMeans(num_clusters)
        km.fit(cluster_data2)
        sil_score = silhouette_score(cluster_data2, km.labels_)
        if sil_score > best_score:
            best_score = sil_score
            num_labels = num_clusters
    clusters.append(num_labels)

optimal_num_clusters = Counter(clusters).most_common(1)[0][0]
print(Counter(clusters).most_common())
print('The optimal number of clusters is {}'.format(optimal_num_clusters))
      
best_labels = []
best_score = -999

for _ in range(1000):
    km = KMeans(optimal_num_clusters)
    km.fit(cluster_data2)
    sil_score = silhouette_score(cluster_data2, km.labels_)
    if sil_score > best_score:
        best_score = sil_score
        best_labels = km.labels_

for i in range(optimal_num_clusters):
    print('Cluster {} comprises {}'.format(i, [name for ix, name in enumerate(names) if best_labels[ix] == i]))

One of the individuals has been brought into the 'deep learning' group.

One last variation will be to match hashtags to concatenated bigrams, and add as many bigrams as matches to the combined tweets for each author. This will allow similarity measures to factor in matches between the bigram and hashtag variants of common two-word phrases, such as 'deep learning,' across authors. This will be used only for clustering, though, since hashtags vs. bigrams is a useful difference for classification purposes. It is expected that this could result in fewer, larger clusters by generating larger similarity scores between authors. 

In [None]:
vectorizer3 = TfidfVectorizer(ngram_range=(2,2), 
                              max_df=0.5, # drop words that occur in more than half the paragraphs
                              min_df=2, # only use words that appear at least twice
                              stop_words='english', 
                              lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                              use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                              norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                              smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                             )

In [None]:
vectorizer3.fit_transform(tweets_by_author['combined'])
bigrams = vectorizer3.get_feature_names()

In [None]:
# add the bigrams

def expand(author):
    addition = []
    for bigram in bigrams:
        num_tags = all_tags[author][bigram.replace(' ', '')]
        if num_tags:
            addition.extend([bigram] * num_tags)
    return ' ' + ' '.join(addition)

In [None]:
transfer = pd.DataFrame(index=names, columns=['extended'])
for name in names:
    transfer.loc[name, 'extended'] = tweets_by_author.loc[name, 'combined'] + expand(name)

In [None]:
auth_vectors3 = vectorizer2.fit_transform(transfer['extended'])
terms3 = vectorizer2.get_feature_names()
cluster_df3 = pd.DataFrame(auth_vectors3.toarray(), index=names, columns=terms3)

In [None]:
clusters = []
cluster_data3 = normalize(cluster_df3)
for _ in range(1000):
    best_score = -999
    num_labels = 0
    for num_clusters in range(3, 8):
        km = KMeans(num_clusters)
        km.fit(cluster_data3)
        sil_score = silhouette_score(cluster_data3, km.labels_)
        if sil_score > best_score:
            best_score = sil_score
            num_labels = num_clusters
    clusters.append(num_labels)

optimal_num_clusters = Counter(clusters).most_common(1)[0][0]
print(Counter(clusters).most_common())
print('The optimal number of clusters is {}'.format(optimal_num_clusters))
      
best_labels3 = []
best_score3 = -999
centers3 = []

for _ in range(1000):
    km = KMeans(optimal_num_clusters)
    km.fit(cluster_data3)
    sil_score = silhouette_score(cluster_data3, km.labels_)
    if sil_score > best_score3:
        best_score3 = sil_score
        best_labels3 = km.labels_
        centers3 = km.cluster_centers_

for i in range(optimal_num_clusters):
    print('Cluster {} comprises {}'.format(i, [name for ix, name in enumerate(names) if best_labels3[ix] == i]))

This technique produces the most compact clustering. The number of clusters has come down to 5, with the other former individual joining the group that seems to be based on network connections. But EvanSinar, who despite using more than 600 hashtags and mentions initially was put in with the group that used few or none of these entities, now is grouped as an individual. This would support a hypothesis that this author used entities not used by the other authors, since "double counting" these as bigrams would isolate this person from both those that used more common entities and those that used few or none. It appears that hashtags and mentions dominated the clustering even when the full tweet texts were used.

In [None]:
centroids3 = centers3.argsort()[:, ::-1]
words_df3 = pd.DataFrame(cluster_data3, index=names, columns=terms3)
groupings3 = dict(zip(names, best_labels3))
print('High-impact words close to the centroid and common to all members in each cluster\n\n')
for i in range(optimal_num_clusters):
    print('Cluster {}:'.format(i))
    people = [name for name in names if groupings3[name] == i]
    words = []
    for ind in centroids3[i]:
        word = terms3[ind]
        common = True
        for person in people:
            if words_df3.loc[person, word] <= 0.001: # threshold lowered to produce results for cluster 0
                common = False
                break
        if common == True:
            words.append(word)
        if len(words) == 20:
            break
    print('\n', words, '\n\n')

As the clusters grow larger, it is more difficult to find terms that all members have in common. But the terms closest to the cluster centers would still sketch common themes.

In [None]:
print('Words closest to the centroid of each cluster\n\n')
for i in range(optimal_num_clusters):
    print('Cluster {}:'.format(i))
    words = [terms3[ind] for ind in centroids3[i][:10]]
    print(words, '\n\n')

The former group that seemed to highlight management concerns but also might have had a stronger connection to the ODSC Europe conference has disappeared, with both members now assigned to other clusters. The largest now is the AI/machine learning group, cluster 0, comprising KirkDBorne, Ronald_vanLoon, BernardMarr, kdnuggets and analyticbridge. Next comes the cluster 1 group of four authors — AndrewYNg, mrogati, naval, hmason — who used few or no hashtags and mentions. The emerging technology group, cluster 4, now has three members: craigbrownphd, BigDataGal and tamaradull. Interestingly, cluster 2, the two authors seemingly grouped together via networking — bobehayes and data_nerd — now has some strong thematic content having to do with machine learning and AI. But it remains separate. Also interesting is that cluster 3, the only remaining group of one, EvanSinar, has an especially tight focus on data stories and visualization.

In [None]:
best_score1

In [None]:
best_score3

Besides its tighter thematic ties, this final 5-cluster grouping has a silhouette score of .032, which is 25% lower than the .040 score of the 7-cluster grouping based on the initial tf=idf vectorization of the tweet texts.

In [None]:
# reduce feature space to 5 principal components and choose two with the greatest spread among authors for graphing

cluster_lsa3 = svd.fit_transform(cluster_data3)

authors_by_component3 = pd.DataFrame(cluster_lsa3, index=names)
for i in range(5):
    print('Component {}:'.format(i))
    print(authors_by_component3.loc[:,i].sort_values(ascending=False)[0:15])

In [None]:
variance_explained3 = svd.explained_variance_ratio_
for i in range(5):
    print('Component {} explains {} of the total variance:'.format(i, variance_explained3[i]))

*Note: It seems odd that the first component explains less of the variance than any of the next 4.*

In [None]:
authors_by_component3['label'] = best_labels3
cmap = sns.color_palette('cubehelix', 5)
ax = sns.scatterplot(data=authors_by_component3, x=1, y=3, hue='label', palette=cmap)
ax.set_title('Clusters plotted against 2 of 5 principal components')
ax.set_ylabel('lsa3')
ax.set_xlabel('lsa1')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.savefig('cluster_plot')

In the final clustering, none of the clusters overlap even though the two components being graphed explain less than 20% of the total variance. This low explained variance also provides reasonable cause for the two points closest together in the graph, near (0.3, -0.4) being assigned to different clusters.