In [1]:
import ast
import numpy as np
import pandas as pd
import pickle

from collections import Counter
from collections import defaultdict
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [2]:
# Width of the raw rating-count matrix. Rating ids are used as (1-based) column
# positions, so this is the highest rating id (26), not the number of distinct
# rating names (14, see the reference below).
NUM_RATINGS_TYPES = 26
# How many of the most frequent tags are kept as indicator features.
NUM_TAGS_TYPES = 20
# Number of k-means clusters (also reused as n_init when fitting).
CLUSTER_NUM = 200
# Columns (not rows, despite the name) removed before clustering: free text,
# structured strings and identifiers that cannot be fed to k-means as-is.
DROP_ROWS = ['main_speaker', 'description', 'name', 'tags', 'ratings', 'related_talks', 'speaker_occupation', 'title', 'url']

'''
Rating Reference
{1: 'Beautiful',
 2: 'Confusing',
 3: 'Courageous',
 7: 'Funny',
 8: 'Informative',
 9: 'Ingenious',
 10: 'Inspiring',
 11: 'Longwinded',
 21: 'Unconvincing',
 22: 'Fascinating',
 23: 'Jaw-dropping',
 24: 'Persuasive',
 25: 'OK',
 26: 'Obnoxious'}
'''

"\nRating Reference\n{1: 'Beautiful',\n 2: 'Confusing',\n 3: 'Courageous',\n 7: 'Funny',\n 8: 'Informative',\n 9: 'Ingenious',\n 10: 'Inspiring',\n 11: 'Longwinded',\n 21: 'Unconvincing',\n 22: 'Fascinating',\n 23: 'Jaw-dropping',\n 24: 'Persuasive',\n 25: 'OK',\n 26: 'Obnoxious'}\n"

In [3]:
# Load the raw talk metadata; the path is relative to the notebook's working directory.
tedData = pd.read_csv('ted_main.csv')

In [4]:
def build_ratings_tags_dict(ratings_tags_dict, data, num_types=None):
    """Collect the rating id -> rating name lookup from the talks table.

    Parameters
    ----------
    ratings_tags_dict : dict
        Mapping to fill in; it is updated in place and also returned.
    data : pandas.DataFrame
        Must have a 'ratings' column whose cells are string literals of a
        list of dicts carrying at least 'id' and 'name'.
    num_types : int, optional
        Stop scanning once this many distinct ids are known. Defaults to the
        module-level NUM_RATINGS_TYPES.

    Returns
    -------
    dict
        The same ``ratings_tags_dict`` object, mapping rating id to name.
    """
    if num_types is None:
        num_types = NUM_RATINGS_TYPES
    # Iterate the column's values positionally so a filtered or re-indexed
    # frame works as well as one with the default 0..n-1 index.
    for raw_ratings in data['ratings'].values:
        for rating in ast.literal_eval(raw_ratings):
            ratings_tags_dict[rating['id']] = rating['name']
            # Return rather than break: a break here would only leave the
            # inner loop and every remaining talk would still be parsed.
            if len(ratings_tags_dict) >= num_types:
                return ratings_tags_dict
    return ratings_tags_dict

# Rating id -> rating name lookup, collected from the talks' 'ratings' column.
ratings_tags_dict = build_ratings_tags_dict({}, tedData)

In [5]:
def ratings_conversion(data, num_ratings_types=None):
    """Turn each talk's rating counts into per-talk rating proportions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'ratings' column whose cells are string literals of a
        list of dicts with 'id', 'name' and 'count'.
    num_ratings_types : int, optional
        Width of the intermediate count matrix, i.e. the largest rating id
        expected. Defaults to the module-level NUM_RATINGS_TYPES.

    Returns
    -------
    pandas.DataFrame
        One row per talk (default 0..n-1 index) and one column per rating
        that received at least one vote, ordered by rating id and labelled
        with the rating name. Each row sums to 1, except talks with no
        votes at all, which are left as all zeros.

    Raises
    ------
    IndexError
        If a rating id exceeds ``num_ratings_types``.
    """
    if num_ratings_types is None:
        num_ratings_types = NUM_RATINGS_TYPES
    counts = np.zeros((len(data), num_ratings_types))
    # Column position -> rating name, gathered from the same rows that fill
    # the matrix. This keeps labels tied to their columns; the order of a
    # separately built dict need not match the id-ordered columns.
    name_by_col = {}
    # Positional iteration: does not assume a default 0..n-1 index.
    for row, raw_ratings in enumerate(data['ratings'].values):
        for rating in ast.literal_eval(raw_ratings):
            col = rating['id'] - 1  # ids are 1-based, columns are 0-based
            counts[row, col] = rating['count']
            name_by_col[col] = rating['name']
    row_totals = counts.sum(axis=1, keepdims=True)
    # Skip the division for talks without any votes so they stay 0 rather
    # than becoming NaN (0/0).
    proportions = np.divide(counts, row_totals,
                            out=np.zeros_like(counts), where=row_totals > 0)
    # Data clean: drop the columns of ids that never received a vote.
    used_cols = np.flatnonzero(counts.sum(axis=0))
    column_names = [name_by_col[int(col)] for col in used_cols]
    return pd.DataFrame(proportions[:, used_cols], columns=column_names)

In [6]:
# Per-talk rating proportions, one column per rating name (a DataFrame, despite the `_mtx` suffix).
ratings_mtx = ratings_conversion(tedData)

In [7]:
# Preview the first rows only instead of rendering the whole frame.
ratings_mtx.head(10)

Unnamed: 0,Beautiful,Confusing,Courageous,Funny,Informative,Ingenious,Inspiring,Longwinded,Unconvincing,Fascinating,Jaw-dropping,Persuasive,OK,Obnoxious
0,0.048727,0.002579,0.034662,0.209323,0.078274,0.064710,0.265573,0.004124,0.003197,0.112744,0.047299,0.114054,0.012509,0.002227
1,0.019755,0.021117,0.047343,0.185286,0.150886,0.019074,0.140668,0.038488,0.087875,0.044959,0.039510,0.091281,0.069142,0.044619
2,0.021246,0.009561,0.015935,0.341360,0.139873,0.064802,0.081445,0.027620,0.036827,0.058782,0.019122,0.081445,0.051700,0.050283
3,0.078058,0.008584,0.203863,0.015826,0.101931,0.028165,0.287017,0.014217,0.009657,0.035408,0.061695,0.123391,0.022800,0.009388
4,0.036768,0.002810,0.012412,0.054254,0.212061,0.124980,0.112920,0.004294,0.002615,0.179781,0.145824,0.099219,0.009680,0.002381
5,0.045976,0.019601,0.046952,0.071763,0.067596,0.025853,0.339346,0.018560,0.024551,0.087914,0.043566,0.157788,0.028718,0.021816
6,0.046182,0.012004,0.079527,0.414138,0.037679,0.053351,0.073024,0.032177,0.037513,0.052684,0.010170,0.051851,0.051350,0.048349
7,0.095703,0.024414,0.033203,0.008789,0.118164,0.190430,0.170898,0.025391,0.025391,0.149414,0.055664,0.036133,0.061523,0.004883
8,0.036746,0.017739,0.142423,0.015205,0.157881,0.053472,0.111759,0.044856,0.071718,0.079321,0.015205,0.193614,0.039027,0.021034
9,0.092110,0.013330,0.057163,0.003122,0.036508,0.013931,0.306953,0.036748,0.100636,0.055122,0.025099,0.185301,0.029422,0.044554


In [8]:
def transformEventData(data):
    """One-hot encode the 'event' column and keep every other column as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Talks table; the 'event' column, when present, is treated as
        categorical.

    Returns
    -------
    pandas.DataFrame
        The non-'event' columns (in the sorted order produced by
        ``Index.difference``) followed by one ``event_<value>`` dummy column
        per distinct event.
    """
    # Column lookup instead of DataFrame.iteritems(), which newer pandas
    # releases no longer provide.
    categorical = [col for col in data.columns if col == 'event']
    numerical = data.columns.difference(categorical)
    data_cat = pd.get_dummies(data[categorical])
    return pd.concat([data[numerical], data_cat], axis=1)

In [9]:
# One-hot encode 'event', then drop the columns listed in DROP_ROWS
# (axis=1, so these are columns despite the constant's name).
trans_tedData = transformEventData(tedData).drop(DROP_ROWS, axis=1)

In [11]:
def build_tag_dict(data, num_tags=None):
    """Return the most frequently used tags, most common first.

    Despite the name, the result is a list of tag names, not a dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'tags' column whose cells are string literals of a list
        of tag names.
    num_tags : int, optional
        How many tags to return. Defaults to the module-level NUM_TAGS_TYPES.

    Returns
    -------
    list of str
        Up to ``num_tags`` tag names ordered by descending frequency.
    """
    if num_tags is None:
        num_tags = NUM_TAGS_TYPES
    tag_count = Counter()
    # Positional iteration: does not assume a default 0..n-1 index.
    for raw_tags in data['tags'].values:
        tag_count.update(ast.literal_eval(raw_tags))
    return [tag for tag, _ in tag_count.most_common(num_tags)]

# Names of the NUM_TAGS_TYPES most frequent tags, most common first.
top_tags = build_tag_dict(tedData)

416


In [12]:
# Inspect the selected tags; tags_conversion uses this list as its column order.
top_tags

['technology',
 'science',
 'global issues',
 'culture',
 'TEDx',
 'design',
 'business',
 'entertainment',
 'health',
 'innovation',
 'society',
 'art',
 'social change',
 'future',
 'communication',
 'creativity',
 'biology',
 'humanity',
 'collaboration',
 'environment']

In [27]:
def tags_conversion(data, tags=None):
    """Build 0/1 indicator columns telling which of the top tags each talk has.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'tags' column whose cells are string literals of a list
        of tag names.
    tags : sequence of str, optional
        Tags to encode, in column order. Defaults to the module-level
        ``top_tags``.

    Returns
    -------
    pandas.DataFrame
        One row per talk (default 0..n-1 index), one float column per tag;
        1.0 where the talk carries the tag, 0.0 otherwise.
    """
    if tags is None:
        tags = top_tags
    tags = list(tags)
    # Size the matrix from the tag list itself so it always matches the
    # column labels, even when fewer than NUM_TAGS_TYPES tags exist.
    tags_mtx = np.zeros((len(data), len(tags)))
    # Positional iteration: does not assume a default 0..n-1 index.
    for row, raw_tags in enumerate(data['tags'].values):
        talk_tags = set(ast.literal_eval(raw_tags))
        for col, tag in enumerate(tags):
            if tag in talk_tags:
                tags_mtx[row, col] = 1
    return pd.DataFrame(tags_mtx, columns=tags)

In [28]:
# 0/1 indicator columns, one per entry of top_tags.
tags_data = tags_conversion(tedData)

In [29]:
# Preview the first rows only instead of rendering the whole frame.
tags_data.head(10)

Unnamed: 0,technology,science,global issues,culture,TEDx,design,business,entertainment,health,innovation,society,art,social change,future,communication,biology,creativity,humanity,collaboration,environment
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Assemble the clustering feature table: tag indicators, the remaining talk
# columns with event dummies, and rating proportions, side by side.
# NOTE(review): concat(axis=1) aligns rows on index labels; this assumes tedData
# still has the default 0..n-1 index — confirm if tedData is ever filtered.
cleaned_tedData = pd.concat([tags_data, trans_tedData,ratings_mtx], axis=1)

In [31]:
# Preview the first rows only instead of rendering the whole frame.
cleaned_tedData.head(10)

Unnamed: 0,technology,science,global issues,culture,TEDx,design,business,entertainment,health,innovation,...,Informative,Ingenious,Inspiring,Longwinded,Unconvincing,Fascinating,Jaw-dropping,Persuasive,OK,Obnoxious
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.078274,0.064710,0.265573,0.004124,0.003197,0.112744,0.047299,0.114054,0.012509,0.002227
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.150886,0.019074,0.140668,0.038488,0.087875,0.044959,0.039510,0.091281,0.069142,0.044619
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.139873,0.064802,0.081445,0.027620,0.036827,0.058782,0.019122,0.081445,0.051700,0.050283
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.101931,0.028165,0.287017,0.014217,0.009657,0.035408,0.061695,0.123391,0.022800,0.009388
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.212061,0.124980,0.112920,0.004294,0.002615,0.179781,0.145824,0.099219,0.009680,0.002381
5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.067596,0.025853,0.339346,0.018560,0.024551,0.087914,0.043566,0.157788,0.028718,0.021816
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037679,0.053351,0.073024,0.032177,0.037513,0.052684,0.010170,0.051851,0.051350,0.048349
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.118164,0.190430,0.170898,0.025391,0.025391,0.149414,0.055664,0.036133,0.061523,0.004883
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.157881,0.053472,0.111759,0.044856,0.071718,0.079321,0.015205,0.193614,0.039027,0.021034
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036508,0.013931,0.306953,0.036748,0.100636,0.055122,0.025099,0.185301,0.029422,0.044554


In [32]:
# Fixed seed so the cluster assignments (and the pickles written from them)
# are the same on every Restart & Run All.
KMEANS_SEED = 0
# n_init reuses CLUSTER_NUM, i.e. as many k-means++ restarts as clusters.
# NOTE(review): features are clustered unscaled even though `scale` is
# imported above — confirm that is intended.
kmeans = KMeans(init='k-means++', n_clusters=CLUSTER_NUM, n_init=CLUSTER_NUM,
                random_state=KMEANS_SEED)
kmeans.fit(cleaned_tedData)
# Cluster label of each row of cleaned_tedData, in row order.
kmeanlabels = kmeans.labels_

In [47]:
def build_cluster_mapping_and_size(cluster_labels):
    """Index a sequence of cluster labels in both directions.

    Parameters
    ----------
    cluster_labels : sequence
        Cluster label of each item, addressed by position 0..len-1.

    Returns
    -------
    tuple
        ``(members, sizes, label_of)`` where ``members`` is a
        defaultdict(list) from label to the positions carrying it,
        ``sizes`` is a defaultdict(int) from label to how many positions
        carry it, and ``label_of`` is a plain dict from position to label.
    """
    # Position -> label first; both reverse views are derived from it.
    label_of = {pos: cluster_labels[pos] for pos in range(len(cluster_labels))}
    members = defaultdict(list)
    sizes = defaultdict(int)
    for pos, label in label_of.items():
        members[label].append(pos)
        sizes[label] += 1
    return members, sizes, label_of

# cluster id -> member row positions, cluster id -> member count, row position -> cluster id.
clusterId_to_tedId, cnt_dict_size, tedId_to_clusterId = build_cluster_mapping_and_size(kmeanlabels)

In [48]:
# Inspect cluster membership (cluster id -> list of row positions).
clusterId_to_tedId

defaultdict(list,
            {0: [2099,
              2100,
              2101,
              2106,
              2108,
              2110,
              2113,
              2115,
              2116,
              2117,
              2118,
              2119,
              2120,
              2123,
              2124,
              2125,
              2127,
              2128,
              2129,
              2131,
              2133,
              2134,
              2135,
              2136,
              2137,
              2138,
              2139,
              2146,
              2147,
              2148,
              2149,
              2150,
              2151,
              2155],
             1: [614,
              615,
              616,
              618,
              620,
              622,
              624,
              626,
              628,
              630,
              631,
              633,
              634,
              635,
              638,
          

In [45]:
# Spot-check a single talk by row label; the double brackets keep the result a one-row DataFrame.
tedData.loc[[366]]

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
366,68,In this unmissable look at the magic of comics...,1028,TED2005,1109548800,25,Scott McCloud,Scott McCloud: The visual magic of comics,1,1231808400,"[{'id': 9, 'name': 'Ingenious', 'count': 217},...","[{'id': 215, 'hero': 'https://pe.tedcdn.com/im...",Cartoonist,"['art', 'books', 'children', 'computers', 'cre...",The visual magic of comics,https://www.ted.com/talks/scott_mccloud_on_com...,1022333


In [41]:
# Spot-check a single talk by row label; the double brackets keep the result a one-row DataFrame.
tedData.loc[[291]]

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
291,68,"Storyteller Carmen Agra Deedy spins a funny, w...",1414,TED2005,1107302400,23,Carmen Agra Deedy,"Carmen Agra Deedy: Once upon a time, my mother...",1,1221181200,"[{'id': 25, 'name': 'OK', 'count': 84}, {'id':...","[{'id': 60, 'hero': 'https://pe.tedcdn.com/ima...",Storyteller,"['children', 'entertainment', 'memory', 'story...","Once upon a time, my mother ...",https://www.ted.com/talks/carmen_agra_deedy_sp...,672797


In [50]:
# Persist cluster id -> member row positions. The context manager closes the
# file (flushing it to disk) even if pickling fails.
with open("clus200KM++clusterId_to_tedId.p", "wb") as fh:
    pickle.dump(clusterId_to_tedId, fh)

In [51]:
# Persist the per-row cluster labels. This cell previously wrote
# clusterId_to_tedId again, so the "labels" file duplicated the mapping file.
with open("clus200KM++labels.p", "wb") as fh:
    pickle.dump(kmeanlabels, fh)

In [12]:
# Swap the raw 'ratings' string column for the per-rating proportion columns.
# drop() returns a new frame, so tedData itself is left untouched.
copy_tedData = pd.concat([tedData.drop(['ratings'], axis=1), ratings_mtx], axis=1)

In [13]:
# Export the talks table with rating-proportion columns in place of the raw 'ratings' string.
copy_tedData.to_csv("ted_talks_ratings.csv")