In [48]:
import ast
import numpy as np
import pandas as pd
import pickle
import re

from collections import Counter
from collections import defaultdict
from sklearn import metrics
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [84]:
# Width of the raw rating-count matrix. Rating ids are used as 1-based column
# positions, so this has to be at least the largest rating id (26 in the
# reference below), not the number of distinct rating names.
NUM_RATINGS_TYPES = 26
# How many of the most frequent tags are kept as indicator columns.
NUM_TAGS_TYPES = 20
# Shared clustering setting (used for KMeans n_init and as the base of the
# SpectralClustering n_clusters).
CLUSTER_NUM = 50
# Column labels (despite the name) removed with DataFrame.drop(..., axis=1)
# before clustering. Each label is listed once.
DROP_ROWS = ['main_speaker', 'description', 'name', 'tags', 'ratings', 'related_talks', 'speaker_occupation', 'title', 'url']

'''
Rating Reference
{1: 'Beautiful',
 2: 'Confusing',
 3: 'Courageous',
 7: 'Funny',
 8: 'Informative',
 9: 'Ingenious',
 10: 'Inspiring',
 11: 'Longwinded',
 21: 'Unconvincing',
 22: 'Fascinating',
 23: 'Jaw-dropping',
 24: 'Persuasive',
 25: 'OK',
 26: 'Obnoxious'}
'''

"\nRating Reference\n{1: 'Beautiful',\n 2: 'Confusing',\n 3: 'Courageous',\n 7: 'Funny',\n 8: 'Informative',\n 9: 'Ingenious',\n 10: 'Inspiring',\n 11: 'Longwinded',\n 21: 'Unconvincing',\n 22: 'Fascinating',\n 23: 'Jaw-dropping',\n 24: 'Persuasive',\n 25: 'OK',\n 26: 'Obnoxious'}\n"

In [85]:
tedData = pd.read_csv('ted_main.csv', converters={"views":int})

In [86]:
def build_ratings_tags_dict(ratings_tags_dict, data, num_ratings_types=None):
    """Fill ``ratings_tags_dict`` with the rating id -> rating name pairs in ``data``.

    Parameters
    ----------
    ratings_tags_dict : dict
        Mapping to populate. It is mutated in place and also returned.
    data : pandas.DataFrame
        Needs a ``ratings`` column whose cells are string literals of a list
        of dicts carrying at least the keys ``'id'`` and ``'name'``.
    num_ratings_types : int, optional
        Scanning stops as soon as the mapping holds this many ids. Defaults to
        the notebook-level ``NUM_RATINGS_TYPES``.

    Returns
    -------
    dict
        The same ``ratings_tags_dict`` object that was passed in.
    """
    if num_ratings_types is None:
        num_ratings_types = NUM_RATINGS_TYPES
    # Walk the cell values directly instead of data['ratings'][i]: an integer
    # lookup on a Series is label based and fails when the index is not
    # 0..n-1 (for example after rows were filtered out).
    for raw_ratings in data['ratings']:
        for rating in ast.literal_eval(raw_ratings):
            ratings_tags_dict[rating['id']] = rating['name']
            if len(ratings_tags_dict) == num_ratings_types:
                # Return (a bare `break` would only leave the inner loop) so
                # the remaining talks are not parsed once every id is known.
                return ratings_tags_dict
    return ratings_tags_dict

ratings_tags_dict = build_ratings_tags_dict({}, tedData)

In [87]:
def ratings_conversion(data, ratings_names=None, num_ratings_types=None):
    """Convert the ``ratings`` column into a table of per-talk rating shares.

    Each talk's vote counts are divided by that talk's total votes, and rating
    ids that never received a vote are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``ratings`` column whose cells are string literals of a list
        of dicts carrying at least the keys ``'id'`` (1-based) and ``'count'``.
    ratings_names : dict, optional
        Rating id -> rating name. Defaults to the notebook-level
        ``ratings_tags_dict``.
    num_ratings_types : int, optional
        Width of the intermediate count matrix; must be at least the largest
        rating id. Defaults to the notebook-level ``NUM_RATINGS_TYPES``.

    Returns
    -------
    pandas.DataFrame
        One row per talk (default 0..n-1 index) and one column per rating id
        that received votes, in ascending id order and labelled by name.
    """
    if ratings_names is None:
        ratings_names = ratings_tags_dict
    if num_ratings_types is None:
        num_ratings_types = NUM_RATINGS_TYPES

    ratings_mtx = np.zeros((data.shape[0], num_ratings_types))
    # enumerate() yields the positional row number, so this also works when
    # the frame's index labels are not 0..n-1.
    for row_pos, raw_ratings in enumerate(data['ratings']):
        for rating in ast.literal_eval(raw_ratings):
            ratings_mtx[row_pos, rating['id'] - 1] = rating['count']  # ids are 1-based

    row_sum = ratings_mtx.sum(axis=1, keepdims=True)
    # A talk without any votes keeps an all-zero row instead of NaN from 0/0.
    ratings_mtx = np.divide(ratings_mtx, row_sum,
                            out=np.zeros_like(ratings_mtx), where=row_sum != 0)

    # Data Clean: keep only the rating ids that received at least one vote.
    valid_indices = np.sum(ratings_mtx, axis=0).nonzero()[0]
    ratings_mtx = ratings_mtx[:, valid_indices]

    # Look every surviving column up by its id. Relying on the iteration order
    # of the name dict would label columns in discovery order (Python 3.7+
    # dicts keep insertion order) while the matrix is in ascending id order.
    columns = [ratings_names[int(col) + 1] for col in valid_indices]
    return pd.DataFrame(ratings_mtx, columns=columns)

In [88]:
ratings_mtx = ratings_conversion(tedData)

In [89]:
ratings_mtx

Unnamed: 0,Beautiful,Confusing,Courageous,Funny,Informative,Ingenious,Inspiring,Longwinded,Unconvincing,Fascinating,Jaw-dropping,Persuasive,OK,Obnoxious
0,0.048727,0.002579,0.034662,0.209323,0.078274,0.064710,0.265573,0.004124,0.003197,0.112744,0.047299,0.114054,0.012509,0.002227
1,0.019755,0.021117,0.047343,0.185286,0.150886,0.019074,0.140668,0.038488,0.087875,0.044959,0.039510,0.091281,0.069142,0.044619
2,0.021246,0.009561,0.015935,0.341360,0.139873,0.064802,0.081445,0.027620,0.036827,0.058782,0.019122,0.081445,0.051700,0.050283
3,0.078058,0.008584,0.203863,0.015826,0.101931,0.028165,0.287017,0.014217,0.009657,0.035408,0.061695,0.123391,0.022800,0.009388
4,0.036768,0.002810,0.012412,0.054254,0.212061,0.124980,0.112920,0.004294,0.002615,0.179781,0.145824,0.099219,0.009680,0.002381
5,0.045976,0.019601,0.046952,0.071763,0.067596,0.025853,0.339346,0.018560,0.024551,0.087914,0.043566,0.157788,0.028718,0.021816
6,0.046182,0.012004,0.079527,0.414138,0.037679,0.053351,0.073024,0.032177,0.037513,0.052684,0.010170,0.051851,0.051350,0.048349
7,0.095703,0.024414,0.033203,0.008789,0.118164,0.190430,0.170898,0.025391,0.025391,0.149414,0.055664,0.036133,0.061523,0.004883
8,0.036746,0.017739,0.142423,0.015205,0.157881,0.053472,0.111759,0.044856,0.071718,0.079321,0.015205,0.193614,0.039027,0.021034
9,0.092110,0.013330,0.057163,0.003122,0.036508,0.013931,0.306953,0.036748,0.100636,0.055122,0.025099,0.185301,0.029422,0.044554


In [90]:
def transformEventData(data):
    """One-hot encode the ``event`` column and pass every other column through.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. If it has no ``event`` column, no indicator columns are
        added.

    Returns
    -------
    pandas.DataFrame
        All non-``event`` columns (in the order ``Index.difference`` returns
        them, i.e. not necessarily the input order) followed by the
        ``pd.get_dummies`` indicator columns for ``event``.
    """
    # Only the column labels are needed, so iterate over data.columns;
    # DataFrame.iteritems() no longer exists in pandas 2.0+.
    categorical = [col for col in data.columns if col == 'event']
    # "numerical" here simply means "everything that is not one-hot encoded";
    # text columns are still included and have to be dropped by the caller.
    numerical = data.columns.difference(categorical)
    data_cat = pd.get_dummies(data[categorical])
    data_num = data[numerical]
    return pd.concat([data_num, data_cat], axis=1)

In [91]:
trans_tedData = transformEventData(tedData).drop(DROP_ROWS, axis=1)

In [92]:
def build_tag_dict(data, num_tags=None):
    """Return the most frequently used tags, most common first.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``tags`` column whose cells are string literals of a list of
        tags.
    num_tags : int, optional
        Maximum number of tags to return. Defaults to the notebook-level
        ``NUM_TAGS_TYPES``.

    Returns
    -------
    list
        Up to ``num_tags`` tags ordered by descending number of occurrences.
    """
    if num_tags is None:
        num_tags = NUM_TAGS_TYPES
    tag_count = Counter()
    # Iterate over the cell values rather than data['tags'][i]: an integer
    # lookup on a Series is label based and breaks on a non 0..n-1 index.
    for raw_tags in data['tags']:
        tag_count.update(ast.literal_eval(raw_tags))
    return [tag for tag, _count in tag_count.most_common(num_tags)]

top_tags = build_tag_dict(tedData)

In [93]:
top_tags

['technology',
 'science',
 'global issues',
 'culture',
 'TEDx',
 'design',
 'business',
 'entertainment',
 'health',
 'innovation',
 'society',
 'art',
 'social change',
 'future',
 'communication',
 'biology',
 'creativity',
 'humanity',
 'collaboration',
 'environment']

In [94]:
def tags_conversion(data, tags=None):
    """Build a 0/1 indicator table telling which of the top tags each talk has.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``tags`` column whose cells are string literals of a list of
        tags.
    tags : sequence, optional
        Tags to create columns for, in column order. Defaults to the
        notebook-level ``top_tags``.

    Returns
    -------
    pandas.DataFrame
        One row per talk (default 0..n-1 index) and one float column per tag,
        holding 1.0 where the talk carries the tag and 0.0 elsewhere.
    """
    if tags is None:
        tags = top_tags
    tags = list(tags)
    # Size the matrix from the tags actually used: with a fixed width the
    # DataFrame construction fails whenever fewer tags than that are available.
    tags_mtx = np.zeros((data.shape[0], len(tags)))
    # enumerate() gives positional rows, independent of the frame's index labels.
    for row_pos, raw_tags in enumerate(data['tags']):
        tag_lst = ast.literal_eval(raw_tags)
        for tag_ind, top_tag in enumerate(tags):
            if top_tag in tag_lst:
                tags_mtx[row_pos, tag_ind] = 1
    return pd.DataFrame(tags_mtx, columns=tags)

In [95]:
tags_data = tags_conversion(tedData)

In [96]:
tags_data

Unnamed: 0,technology,science,global issues,culture,TEDx,design,business,entertainment,health,innovation,society,art,social change,future,communication,biology,creativity,humanity,collaboration,environment
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
cleaned_tedData = pd.concat([tags_data, trans_tedData,ratings_mtx], axis=1)

In [98]:
cleaned_tedData

Unnamed: 0,technology,science,global issues,culture,TEDx,design,business,entertainment,health,innovation,...,Informative,Ingenious,Inspiring,Longwinded,Unconvincing,Fascinating,Jaw-dropping,Persuasive,OK,Obnoxious
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.078274,0.064710,0.265573,0.004124,0.003197,0.112744,0.047299,0.114054,0.012509,0.002227
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.150886,0.019074,0.140668,0.038488,0.087875,0.044959,0.039510,0.091281,0.069142,0.044619
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.139873,0.064802,0.081445,0.027620,0.036827,0.058782,0.019122,0.081445,0.051700,0.050283
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.101931,0.028165,0.287017,0.014217,0.009657,0.035408,0.061695,0.123391,0.022800,0.009388
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.212061,0.124980,0.112920,0.004294,0.002615,0.179781,0.145824,0.099219,0.009680,0.002381
5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.067596,0.025853,0.339346,0.018560,0.024551,0.087914,0.043566,0.157788,0.028718,0.021816
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037679,0.053351,0.073024,0.032177,0.037513,0.052684,0.010170,0.051851,0.051350,0.048349
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.118164,0.190430,0.170898,0.025391,0.025391,0.149414,0.055664,0.036133,0.061523,0.004883
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.157881,0.053472,0.111759,0.044856,0.071718,0.079321,0.015205,0.193614,0.039027,0.021034
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036508,0.013931,0.306953,0.036748,0.100636,0.055122,0.025099,0.185301,0.029422,0.044554


## KMeans Clustering

In [99]:
# Fixed seed so the cluster assignments (and anything saved from them) are the
# same on every Restart & Run All.
# NOTE(review): CLUSTER_NUM is passed as n_init (number of restarts) while
# n_clusters is hard-coded to 2 -- confirm this is intended and not swapped.
# NOTE(review): cleaned_tedData mixes 0/1 tag flags and rating shares with raw
# counts such as views; KMeans is distance based, so consider scaling first
# (sklearn.preprocessing.scale is already imported).
kmeans = KMeans(init='k-means++', n_clusters=2, n_init=CLUSTER_NUM, random_state=0)
kmeans.fit(cleaned_tedData)
kmeanlabels=kmeans.labels_

In [101]:
def build_cluster_mapping_and_size(cluster_labels):
    """Index a sequence of cluster labels three ways.

    Parameters
    ----------
    cluster_labels : sequence
        ``cluster_labels[i]`` is the cluster label of item ``i``.

    Returns
    -------
    tuple
        ``(members, sizes, item_to_cluster)`` where ``members`` is a
        ``defaultdict(list)`` mapping label -> item positions in ascending
        order, ``sizes`` is a ``defaultdict(int)`` mapping label -> number of
        items, and ``item_to_cluster`` is a plain dict mapping position -> label.
    """
    members_by_cluster = defaultdict(list)
    size_by_cluster = defaultdict(int)
    cluster_of_item = {}
    for position, label in enumerate(cluster_labels):
        cluster_of_item[position] = label
        members_by_cluster[label].append(position)
        size_by_cluster[label] += 1
    return members_by_cluster, size_by_cluster, cluster_of_item

In [None]:
clusterId_to_tedId, cnt_dict_size, tedId_to_clusterId = build_cluster_mapping_and_size(kmeanlabels)

In [None]:
cnt_dict_size

In [28]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes instead of whenever the handle gets garbage collected.
with open("data/clus50K+clusterId_to_tedId2.pickle", "wb") as pickle_file:
    pickle.dump(clusterId_to_tedId, pickle_file, protocol=2)

In [29]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes instead of whenever the handle gets garbage collected.
with open("data/clus50K+tedId_to_clusterId2.pickle", "wb") as pickle_file:
    pickle.dump(tedId_to_clusterId, pickle_file, protocol=2)

In [73]:
# Replace the raw 'ratings' strings with the per-rating share columns.
# drop() returns a new frame, so tedData itself is left untouched.
copy_tedData = pd.concat([tedData.drop(['ratings'], axis=1), ratings_mtx], axis=1)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,Informative,Ingenious,Inspiring,Longwinded,Unconvincing,Fascinating,Jaw-dropping,Persuasive,OK,Obnoxious
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,...,0.078274,0.064710,0.265573,0.004124,0.003197,0.112744,0.047299,0.114054,0.012509,0.002227
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,...,0.150886,0.019074,0.140668,0.038488,0.087875,0.044959,0.039510,0.091281,0.069142,0.044619
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,...,0.139873,0.064802,0.081445,0.027620,0.036827,0.058782,0.019122,0.081445,0.051700,0.050283
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,...,0.101931,0.028165,0.287017,0.014217,0.009657,0.035408,0.061695,0.123391,0.022800,0.009388
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,...,0.212061,0.124980,0.112920,0.004294,0.002615,0.179781,0.145824,0.099219,0.009680,0.002381
5,672,"Tony Robbins discusses the ""invisible forces"" ...",1305,TED2006,1138838400,36,Tony Robbins,Tony Robbins: Why we do what we do,1,1151440680,...,0.067596,0.025853,0.339346,0.018560,0.024551,0.087914,0.043566,0.157788,0.028718,0.021816
6,919,When two young Mormon missionaries knock on Ju...,992,TED2006,1140739200,31,Julia Sweeney,Julia Sweeney: Letting go of God,1,1152490260,...,0.037679,0.053351,0.073024,0.032177,0.037513,0.052684,0.010170,0.051851,0.051350,0.048349
7,46,Architect Joshua Prince-Ramus takes the audien...,1198,TED2006,1140652800,19,Joshua Prince-Ramus,Joshua Prince-Ramus: Behind the design of Seat...,1,1152490260,...,0.118164,0.190430,0.170898,0.025391,0.025391,0.149414,0.055664,0.036133,0.061523,0.004883
8,852,Philosopher Dan Dennett calls for religion -- ...,1485,TED2006,1138838400,32,Dan Dennett,Dan Dennett: Let's teach religion -- all relig...,1,1153181460,...,0.157881,0.053472,0.111759,0.044856,0.071718,0.079321,0.015205,0.193614,0.039027,0.021034
9,900,"Pastor Rick Warren, author of ""The Purpose-Dri...",1262,TED2006,1140825600,31,Rick Warren,Rick Warren: A life of purpose,1,1153181460,...,0.036508,0.013931,0.306953,0.036748,0.100636,0.055122,0.025099,0.185301,0.029422,0.044554


In [75]:
copy_tedData.to_csv("ted_talks_ratings.csv")

## Spectral Clustering

In [44]:
svd_similarity = np.load("svd_similarity.npy")
rows = tedData.shape[0]

In [6]:
spec_ted = np.zeros((rows,rows))

In [64]:
tedId_to_related = {}
title_to_tedId = {}

# Map every talk's normalised title (non-ASCII characters removed, runs of
# spaces collapsed) to its row label in tedData. If two talks normalise to the
# same title, the later row wins.
for idx, row in tedData.iterrows():
    title = row['title']
    # Strip non-ASCII characters. Byte strings (Python 2 `str`) can be decoded
    # directly; text strings (Python 3 `str`) have no .decode(), so they are
    # round-tripped through an ASCII encode instead.
    if isinstance(title, bytes):
        title = title.decode('ascii', 'ignore')
    else:
        title = title.encode('ascii', 'ignore').decode('ascii')
    title = str(re.sub(' +', ' ', title))
    title_to_tedId[title] = idx

In [65]:
print(len(ast.literal_eval(tedData.iloc[[0]]['related_talks'][0])))
# print(tedData.iloc[[0]]['related_talks'][0])

6


In [66]:
# Start from an empty mapping so that re-running this cell does not append
# every related id a second time.
tedId_to_related = {}
for idx, row in tedData.iterrows():
    related_talks = ast.literal_eval(row['related_talks'])
    for rel_talk in related_talks:
        title = rel_talk['title']
        # Same normalisation as used for the title_to_tedId keys: drop
        # non-ASCII characters (byte strings are decoded, text strings are
        # round-tripped through ASCII) and collapse runs of spaces.
        if isinstance(title, bytes):
            title = title.decode('ascii', 'ignore')
        else:
            title = title.encode('ascii', 'ignore').decode('ascii')
        title = str(re.sub(' +', ' ', title))
        # Raises KeyError when a related talk's title is not in title_to_tedId.
        rel_id = title_to_tedId[title]
        tedId_to_related.setdefault(idx, []).append(rel_id)

In [67]:
tedId_to_related[0]

[692, 1502, 1991, 715, 1023, 2316]

In [80]:
# NOTE(review): `math` is not used in this cell; if no later cell relies on
# it, drop the import or move it to the import cell at the top.
import math
# Fill the affinity matrix: for every talk, copy the SVD similarity to each of
# its related talks into spec_ted and leave every other entry untouched.
# Only the (talk, related talk) entry is written; the mirrored entry is set
# only if the relation is also listed in the other direction, so the matrix is
# not guaranteed to be symmetric.
for talkId, rel_talk_lst in tedId_to_related.items():
    for relId in rel_talk_lst:
        sim = svd_similarity[talkId, relId]
        # Negative similarities are clipped to 0.
        spec_ted[talkId, relId] = max(sim,0)

In [108]:
# Fixed seed so the cluster labels are the same on every Restart & Run All.
# NOTE(review): the cluster count is CLUSTER_NUM plus a hard-coded 50 --
# consider giving it its own named constant.
sc = SpectralClustering(n_clusters=CLUSTER_NUM+50, affinity='precomputed', n_init=100, random_state=0)
sc.fit(spec_ted)
sc_labels = sc.labels_

In [109]:
sc_clusterId_to_tedId, sc_cnt_dict_size, sc_tedId_to_clusterId = build_cluster_mapping_and_size(sc_labels)

In [110]:
sc_cnt_dict_size

defaultdict(int,
            {0: 21,
             1: 8,
             2: 29,
             3: 53,
             4: 11,
             5: 26,
             6: 6,
             7: 4,
             8: 5,
             9: 4,
             10: 6,
             11: 18,
             12: 6,
             13: 23,
             14: 7,
             15: 7,
             16: 8,
             17: 10,
             18: 6,
             19: 12,
             20: 7,
             21: 3,
             22: 6,
             23: 12,
             24: 27,
             25: 5,
             26: 16,
             27: 1259,
             28: 13,
             29: 6,
             30: 15,
             31: 14,
             32: 5,
             33: 6,
             34: 5,
             35: 6,
             36: 17,
             37: 10,
             38: 11,
             39: 4,
             40: 5,
             41: 11,
             42: 26,
             43: 9,
             44: 27,
             45: 6,
             46: 11,
             47: 17,
       