In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pysal.model import spreg
from pysal.lib import weights
from pysal.explore import esda
from scipy import stats
import statsmodels.api as sm
# import statsmodels.formula.api as sm

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [66]:
df = pd.read_csv('ted_main.csv')

In [67]:
df.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [68]:
# df.film_date = pd.to_datetime(df.film_date,unit='s')
# df = df[df.film_date.isin(pd.date_range(start='1/1/2010', end='12/31/2017'))]

In [69]:
df.related_talks = df.related_talks.apply(eval)

In [70]:
df.related_talks.apply(len).value_counts()

6    2464
3      66
1      12
2       8
Name: related_talks, dtype: int64

In [71]:
related_talks = {}

for i, talks in enumerate(df.related_talks):
    related_talks[i] = []
    
    for talk in talks:
        related_talks[i].append(talk['title'])

In [72]:
pairs = set()

titles = df.title.unique()

for i, talks in enumerate(df.related_talks):
    
    for talk in talks:
        
        if (titles[i],talk['title']) not in pairs \
            and (talk['title'],titles[i]) not in pairs \
            and talk['title'] in titles:
            
            pairs.add((titles[i],talk['title']))

In [107]:
W = pd.DataFrame(np.zeros((df.title.nunique(),df.title.nunique())),
                 index=df.title.unique(),columns=df.title.unique())

In [108]:
for pair in pairs:
    W.loc[pair[0],pair[1]]+=1
    W.loc[pair[1],pair[0]]+=1
    
# for title in titles:
#     W.loc[title,title]+=1

### Row Standard Weights wij' = wij / Σj wij

In [118]:
sum_W = W.sum(axis=1)

In [126]:
row_std_w = W.div(sum_W,axis=1)

In [76]:
y = W.columns

In [77]:
# every single movie has a vector
W2 = csr_matrix(W)

# instantiation
knn = NearestNeighbors(metric="cosine")

# consine distance determines similarities (users, movies)
# consine angle (smaller) is inverse to similarities (higher)

knn.fit(W2)

NearestNeighbors(metric='cosine')

In [93]:
x = W.loc["How America's public schools keep kids in poverty",:].values.reshape(1,-1)
distances, indices = knn.kneighbors(x, n_neighbors=7)
indices.reshape(-1)

array([2316, 2445, 1991, 1500, 2328,  203, 1420], dtype=int64)

In [94]:
for i in range(len(indices.reshape(-1))):
  if i == 0:
    print("You are currently watching:", W.index[indices.reshape(-1)[i]])
    continue
  print("You should also watch:", W.index[indices.reshape(-1)[i]])

You are currently watching: How America's public schools keep kids in poverty
You should also watch: A summer school kids actually want to attend
You should also watch: How to fix a broken school? Lead fearlessly, love hard
You should also watch: Our failing schools. Enough is enough!
You should also watch: Help for kids the education system ignores
You should also watch: My wish: Once Upon a School
You should also watch: Kids need structure


In [88]:
df.related_talks[0]

[{'id': 865,
  'hero': 'https://pe.tedcdn.com/images/ted/172559_800x600.jpg',
  'speaker': 'Ken Robinson',
  'title': 'Bring on the learning revolution!',
  'duration': 1008,
  'slug': 'sir_ken_robinson_bring_on_the_revolution',
  'viewed_count': 7266103},
 {'id': 1738,
  'hero': 'https://pe.tedcdn.com/images/ted/de98b161ad1434910ff4b56c89de71af04b8b873_1600x1200.jpg',
  'speaker': 'Ken Robinson',
  'title': "How to escape education's death valley",
  'duration': 1151,
  'slug': 'ken_robinson_how_to_escape_education_s_death_valley',
  'viewed_count': 6657572},
 {'id': 2276,
  'hero': 'https://pe.tedcdn.com/images/ted/3821f3728e0b755c7b9aea2e69cc093eca41abe1_2880x1620.jpg',
  'speaker': 'Linda Cliatt-Wayman',
  'title': 'How to fix a broken school? Lead fearlessly, love hard',
  'duration': 1027,
  'slug': 'linda_cliatt_wayman_how_to_fix_a_broken_school_lead_fearlessly_love_hard',
  'viewed_count': 1617101},
 {'id': 892,
  'hero': 'https://pe.tedcdn.com/images/ted/e79958940573cc610ccb58

### Libpysal

![image.png](attachment:image.png)

In [None]:
df.tags = df.tags.apply(eval)
df_tags = pd.get_dummies(df.tags.apply(pd.Series).stack()).sum(level=0)

In [128]:
data = df.loc[:,['duration','published_date','tags','title','views']]

In [139]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   duration        2550 non-null   int64         
 1   published_date  2550 non-null   datetime64[ns]
 2   tags            2550 non-null   object        
 3   title           2550 non-null   object        
 4   views           2550 non-null   int64         
 5   year            2550 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 119.7+ KB


In [142]:
data.published_date = pd.to_datetime(data.published_date,unit='s')
data['year'] = data.published_date.apply(lambda x: x.year)

In [152]:
data = pd.concat([data,df_tags],axis=1)

In [153]:
X = data.drop(['views','title','tags','published_date'],axis=1)
y = data[['views']]

In [154]:
X = sm.add_constant(X)
model = sm.OLS(y,X)
results = model.fit()

In [155]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  views   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.106
Method:                 Least Squares   F-statistic:                     1.723
Date:                Tue, 16 Nov 2021   Prob (F-statistic):           1.07e-14
Time:                        01:02:15   Log-Likelihood:                -40811.
No. Observations:                2550   AIC:                         8.246e+04
Df Residuals:                    2131   BIC:                         8.491e+04
Df Model:                         418                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [159]:
results.predict(X).mean()

1698297.4815683803

In [95]:
w = weights.KNN.from_dataframe(W, k=8)
w.transform = 'R'
w

KeyError: 'geometry'

### Generate Nodes, Edges for Network Analysis

In [None]:
nodes = pd.concat([df.title,df_tags],axis=1).reset_index()
nodes.rename(columns={'title':'label'},inplace=True)
tilte2idx = nodes.set_index('label')['index'].to_dict()

In [None]:
edges = pd.DataFrame(pairs,columns=['Talk1','Talk2'])
edges.Talk1 = edges.Talk1.apply(lambda x: tilte2idx[x])
edges.Talk2 = edges.Talk2.apply(lambda x: tilte2idx[x])

In [None]:
nodes.to_csv('nodes.csv',index=None)
edges.to_csv('edges.csv',index=None)