## Linear Programming in Python : Create Watch List for TED Videos  ##
Inspired by https://www.analyticsvidhya.com/blog/2017/10/linear-optimization-in-python/

### Import the Library ###  
PuLP is a linear-programming (optimization) modelling package for Python.

In [1]:
% matplotlib inline

from pulp import *
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from IPython.display import Image, display


### Load the Data ###

In [2]:
# Dataset source: https://www.kaggle.com/rounakbanik/ted-talks

# Load the talks table, then express each talk's duration in minutes
# instead of seconds.
ted = pd.read_csv('./data/ted_main.csv', encoding='ISO-8859-1')
ted['duration'] = ted['duration'].div(60)

In [3]:
# Show the first four rows as plain text.
print(ted.head(4))

   comments                                        description   duration  \
0      4553  Sir Ken Robinson makes an entertaining and pro...  19.400000   
1       265  With the same humor and humanity he exuded in ...  16.283333   
2       124  New York Times columnist David Pogue takes aim...  21.433333   
3       200  In an emotionally charged talk, MacArthur-winn...  18.600000   

     event   film_date  languages   main_speaker  \
0  TED2006  1140825600         60   Ken Robinson   
1  TED2006  1140825600         43        Al Gore   
2  TED2006  1140739200         26    David Pogue   
3  TED2006  1140912000         35  Majora Carter   

                                        name  num_speaker  published_date  \
0  Ken Robinson: Do schools kill creativity?            1      1151367060   
1       Al Gore: Averting the climate crisis            1      1151367060   
2              David Pogue: Simplicity sells            1      1151367060   
3         Majora Carter: Greening the ghetto 

In [4]:
# Keep durations to one decimal place (minutes); other columns are untouched.
ted = ted.round(decimals={'duration': 1})

In [None]:
# Show the table's (rows, columns) shape followed by its first three rows.
display(ted.shape, ted.head(3))

In [None]:
# Select subset of columns & rows (if required)
# To work on a random subset instead, sample before resetting the index:
# data = ted.sample(n=1000).reset_index() # 'n' can be changed as required
selected_cols = ['name', 'event', 'duration', 'views']

# reset_index() returns a NEW frame whose 'index' column holds each talk's
# original row label. Unlike `data = ted` followed by an in-place reset, this
# leaves `ted` unchanged and gives the same result every time the cell is run.
data = ted.reset_index()

### The Problem Statement ###  
Select the set of talks that maximizes total views, subject to limits on total watching time and on the number of talks  
https://pythonhosted.org/PuLP/pulp.html?highlight=lpproblem#pulp.LpProblem

In [None]:
# Build the LP model object.
# It is a maximization problem: the objective added later is to be maximized.
prob = pulp.LpProblem(name='WatchingTEDTalks', sense=pulp.LpMaximize)

### To Be (Watched) Or Not to Be ###  
Define one binary (watch / skip) decision variable per talk in PuLP

In [None]:
# One binary decision variable per talk: 1 = watch the talk, 0 = skip it.
# Each variable is named 'x<index>' after the talk's value in data['index'],
# so solver output can later be matched back to its row. The list keeps the
# same order as the rows of `data`.
decision_variables = [
    pulp.LpVariable('x' + str(talk_id), cat='Binary')
    for talk_id in data['index']
]

print('Total number of decision variables: ' + str(len(decision_variables)))

In [None]:
#print(decision_variables)

### Create Optimization Function ###

In [None]:
# Objective: total views of the selected talks, i.e. the sum over all talks of
# views * (0/1 decision variable).
# lpSum builds the linear expression directly; starting an accumulator from an
# empty string only works because PuLP happens to treat '' as an empty iterable.
# .tolist() hands PuLP plain Python numbers rather than NumPy scalars.
total_views = pulp.lpSum(
    views * watch
    for views, watch in zip(data['views'].tolist(), decision_variables)
)

prob += total_views
# print('Optimization function: ' + str(total_views))

### Constraints ###

In [None]:
# Constraints: the limits used by the two constraints on the model
total_time_available_for_talks = 10*60 # Total time available is 10 hours, converted to minutes
total_talks_can_watch = 25 # Cap on the number of talks, to avoid information overload

### First Constraint ###

In [None]:
# Create Constraint 1 - Time for talks
# Total watching time = sum over all talks of duration * (0/1 decision variable).
total_time_talks = pulp.lpSum(
    minutes * watch
    for minutes, watch in zip(data['duration'].tolist(), decision_variables)
)

# The available time is a budget, so it is an upper bound ('<='). Requiring
# equality would force the selection to fill the time exactly, which can skew
# the choice towards longer talks or make the model infeasible.
prob += (total_time_talks <= total_time_available_for_talks)

### Second Constraint ###

In [None]:
# Create Constraint 2 - Number of talks
# Number of selected talks = sum of all 0/1 decision variables.
total_talks = pulp.lpSum(decision_variables)

# The talk count is a cap, so it is an upper bound ('<='): the model stays
# feasible even when fewer talks than the cap can be selected.
prob += (total_talks <= total_talks_can_watch)

In [None]:
# Write the assembled model to 'WatchingTEDTalks.lp' so it can be inspected.
# Uncomment the print below to dump the model in the notebook instead.
#print(prob)
prob.writeLP('WatchingTEDTalks.lp')

In [None]:
# Solve the model; solve() returns the resulting status code.
optimization_result = prob.solve()

# Stop with a descriptive error unless an optimal solution was found.
# An explicit raise (rather than `assert`) still runs under `python -O` and
# reports which status the solver returned.
if optimization_result != pulp.LpStatusOptimal:
    raise RuntimeError(
        'Solver did not reach an optimal solution: ' + LpStatus[optimization_result]
    )

print('Status:', LpStatus[prob.status])
# The method form avoids relying on the star-imported `value` helper name.
print('Optimal Solution to the problem: ', prob.objective.value())
print('Individual decision variables: ')
for v in prob.variables():
    print(v.name, '=', v.varValue)

In [None]:
# Collect the solved variables into a table:
# 'index' holds each variable's name, 'value' the value the solver assigned.
variable_name = [solved.name for solved in prob.variables()]
variable_value = [solved.varValue for solved in prob.variables()]

df = pd.DataFrame({'index': variable_name, 'value': variable_value})

In [None]:
# Preview the first three rows; 'index' still holds the variable names here.
display(df.head(3))

In [None]:
## Turn each variable name back into its integer talk id, e.g. 'x123' -> 123.
## \d matches any Unicode digit (see https://docs.python.org/3/library/re.html),
## so (\d+) captures the first run of digits in the name.
## Done as one vectorised operation: the column ends up with an integer dtype,
## and no notebook-level name such as `value` (PuLP's helper) gets rebound.
df['index'] = df['index'].str.extract(r'(\d+)', expand=False).astype('int64')

In [None]:
# Preview the first three rows after the 'index' column was converted.
display(df.head(3))

In [None]:
# Order the rows by talk id (ascending).
df = df.sort_values('index')

In [None]:
# Preview the first three rows after sorting by 'index'.
display(df.head(3))

In [None]:
# Preview the talks table that the solver results are merged with next.
display(data.head(3))

In [None]:
# Attach each talk's solver decision to its row via the shared 'index' column.
result = pd.merge(data, df, on='index')

# Keep only the selected talks, most viewed first. Solver values are floats and
# may come back as e.g. 0.9999999 rather than exactly 1, so round before
# comparing instead of testing for exact equality.
result = result[result['value'].round() == 1].sort_values(by='views', ascending=False)

### Select Desired Entries ###

In [None]:
# Columns to show in the final watch list, in display order.
selected_cols_final = ['name', 'event', 'duration', 'views']
final_set_of_talks_to_watch = result.loc[:, selected_cols_final]

In [None]:
# Render the watch list as an HTML table, omitting the DataFrame's row index.
from IPython.display import display, HTML
display(HTML(final_set_of_talks_to_watch.to_html(index=False)))