In [1]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/data/video_label/Eluvio_DS_Challenge.csv")
df.shape

(509236, 8)

In [2]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [3]:
df.isnull().sum()

time_created    0
date_created    0
up_votes        0
down_votes      0
title           0
over_18         0
author          0
category        0
dtype: int64

In [4]:
# check how many category included in df
df.category.nunique()

1

In [5]:
# check if there is any down votes
df.down_votes.nunique()

1

### Data Preprocessing

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import LancasterStemmer
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [8]:
def text_cleaning(df):
    for i, text in enumerate(df['title']):
        # remove punctuation
        s = text.translate(str.maketrans('', '', string.punctuation))
        word_tokens = word_tokenize(s)
        # remove stopwords
        stop_words = set(stopwords.words('english'))
        filtered_s = [w for w in word_tokens if w not in stop_words]
        filtered_s = ' '.join(filtered_s)
        # stemming
        my_stemmer = LancasterStemmer()
        stemmed = my_stemmer.stem(filtered_s)
        df.at[i, 'title'] = stemmed
        if (i%100000 == 0): print(i,"completed.")

In [9]:
# text cleaning
text_cleaning(df)

0 completed.
100000 completed.
200000 completed.
300000 completed.
400000 completed.
500000 completed.


In [10]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,scores killed pakistan clashe,False,polar,worldnews
1,1201232075,2008-01-25,2,0,japan resumes refuelling misj,False,polar,worldnews
2,1201232523,2008-01-25,3,0,us presses egypt gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,jumpstart economy give health care,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,council europe bashes euun terror blacklist,False,mhermans,worldnews


In [11]:
for i,value in enumerate(df['title']):
  keyword = value + " " + str(df['over_18'][i]) + " " + df['author'][i]
  df.at[i, 'keyword'] = keyword

In [12]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category,keyword
0,1201232046,2008-01-25,3,0,scores killed pakistan clashe,False,polar,worldnews,scores killed pakistan clashe False polar
1,1201232075,2008-01-25,2,0,japan resumes refuelling misj,False,polar,worldnews,japan resumes refuelling misj False polar
2,1201232523,2008-01-25,3,0,us presses egypt gaza border,False,polar,worldnews,us presses egypt gaza border False polar
3,1201233290,2008-01-25,1,0,jumpstart economy give health care,False,fadi420,worldnews,jumpstart economy give health care False fadi420
4,1201274720,2008-01-25,4,0,council europe bashes euun terror blacklist,False,mhermans,worldnews,council europe bashes euun terror blacklist Fa...


In [34]:
X = np.array(df["title"])
y = np.array(df["up_votes"])

In [35]:
print(X.shape, y.shape)

(509236,) (509236,)


In [36]:
# tokenize and vectorize
def token_vectorize(X):
  tokenizer = Tokenizer(num_words=50000, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
  tokenizer.fit_on_texts(X)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  X = tokenizer.texts_to_sequences(X)
  X = pad_sequences(X)
  print('Shape of data tensor:', X.shape)
  return X

In [37]:
X = token_vectorize(X)

Found 119278 unique tokens.
Shape of data tensor: (509236, 47)


In [38]:
# split train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 1. Votes Prediction

For Creator: if title attactive enough for more votes?     
use title to predict up_votes, perform Natural language processing on title.     
a. use LSTM to predict     
b. use Machine learning algorithms (Linear Regression, K-NN Regressor, Random Forest Regressor)

### 1a. LSTM



In [39]:
#normalize y
from sklearn import preprocessing
import numpy as np
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
normalized_y_train = preprocessing.normalize(y_train)
normalized_y_test = preprocessing.normalize(y_test)

In [40]:
print("train:", X_train.shape, y_train.shape, normalized_y_train.shape)
print("test:", X_test.shape, y_test.shape, normalized_y_test.shape)

train: (356465, 47) (356465, 1) (356465, 1)
test: (152771, 47) (152771, 1) (152771, 1)


In [41]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

model = Sequential()
model.add(Embedding(500000, 100, input_length=X_train.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 100)           50000000  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 50,080,501
Trainable params: 50,080,501
Non-trainable params: 0
_________________________________________________________________


In [43]:
# model training
model.fit(X_train, normalized_y_train, epochs=2, batch_size=128)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f007fa2d0d0>

In [44]:
# prediction
results_lstm = model.predict(X_test)

# mean accuracy
score_lstm = model.evaluate(X_test,normalized_y_test)



In [49]:
print("accuracy:", (score_lstm[1])*100, "%")

accuracy: 81.69940710067749 %


### 1b. Machine Learning Algorithms

In [51]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [54]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_lr_pred = lr.predict(X_test)

knn_regressor=KNeighborsRegressor(n_neighbors = 5)
knn_model=knn_regressor.fit(X_train,y_train)
y_knn_pred=knn_model.predict(X_test)

rf = RandomForestRegressor()
rf_model = rf.fit(X_train, y_train)
y_rf_pred=rf_model.predict(X_test)

  # Remove the CWD from sys.path while we load stuff.


In [103]:
y_t = []
for val in list(y_test):
  num = str(val)
  num = num[1:-1]
  y_t.append(int(num))

In [107]:
predict_table = pd.DataFrame(columns=['y_lr_pred','y_knn_pred','y_rf_pred', 'average','y_test'])
predict_table['y_test'] = y_t
predict_table['y_lr_pred'] = y_lr_pred
predict_table['y_knn_pred'] = y_knn_pred
predict_table['y_rf_pred'] = y_rf_pred
  
for i, val in enumerate(predict_table['y_lr_pred']):
  predict_table.at[i,'average'] = (int(val) + int(predict_table['y_knn_pred'][i]) + int(predict_table['y_rf_pred'][i]))/3

predict_table

Unnamed: 0,y_lr_pred,y_knn_pred,y_rf_pred,average,y_test
0,93.252379,9.0,102.210000,68,23
1,95.559948,2.8,23.005000,40,0
2,89.942487,86.8,208.016667,127.667,291
3,98.110516,34.4,25.370000,52.3333,794
4,100.350914,4.6,19.110000,41,10
...,...,...,...,...,...
152766,108.132884,1396.2,225.606667,576.333,0
152767,99.894313,6.8,129.048333,78,2
152768,94.980997,663.4,58.340000,271.667,74
152769,136.638485,3.6,99.930000,79.3333,28


## 2. Recommend similar video

For Viewer:      
based on the watched videos, recommend similar video.    
Currently: Content-based recommender    
Can use Collaborative filtering/Hybrid Recommender if users' video ratings available.

In [71]:
# vectorize the keywords
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['keyword'][:10000]) 
# limit data size to avoid crashing due to high computional power in calculating cosine similarity

In [72]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [73]:
# Indices
indices = pd.Series(df.index,index=df['title']).drop_duplicates()

In [74]:
def single_text_cleaning(text):
  # remove punctuation
  s = text.translate(str.maketrans('', '', string.punctuation))
  word_tokens = word_tokenize(s)
  # remove stopwords
  stop_words = set(stopwords.words('english'))
  filtered_s = [w for w in word_tokens if w not in stop_words]
  filtered_s = ' '.join(filtered_s)
  # stemming
  my_stemmer = LancasterStemmer()
  stemmed = my_stemmer.stem(filtered_s)
  return stemmed

In [75]:
# content-based recommender
def get_recommendation(original_title,cosine_sim=cosine_sim,df=df):
    title = single_text_cleaning(original_title)
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    video_indices = [i[0] for i in sim_scores]
    
    recommendations = df.iloc[video_indices][['title','up_votes','down_votes','over_18','author']]
    recommendations = recommendations.sort_values('up_votes', ascending=False)
    return recommendations

In [76]:
# examples: (title) US presses Egypt on Gaza border
rec_list = get_recommendation("US presses Egypt on Gaza border")
rec_list

Unnamed: 0,title,up_votes,down_votes,over_18,author
7245,south america nations found union,36,0,False,polar
9216,bill presses iraq recognize israel,6,0,False,treebright
55,israel plans egypt border fence,3,0,False,JoeyRamone63
1962,russia medvedev presses nato expansion,2,0,False,twolf1
2068,bread riots egypt,2,0,False,jips
5478,somalis protest us strike,1,0,False,polar
859,venezuela troops move border,1,0,False,DarthTater
9445,israel hamas agree gaza truce egypt claims,0,0,False,M_Rock
5240,iran presses ahead with proposed natural gas c...,0,0,False,jips
5592,babies bodies german freezer,0,0,False,polar
