### Data Collection

In [98]:
import pandas as pd
from os import path
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import sklearn

# Edit path if need be (shouldn't need to b/c we all have the same folder structure)
CSV_PATH_1 = '../Videos/all_data'
CSV_PATH_2 = '../Videos2/all_data2'
FILE_EXTENSION = '_all.csv'
GENRES = ['country', 'edm', 'pop', 'rap', 'rock']

# Containers for the data frames
genre_dfs = {}
all_genres = None

# Read in the 5 genre's of CV's
for genre in GENRES:
    genre_csv_path_1 = path.join(CSV_PATH_1, genre) + FILE_EXTENSION
    genre_csv_path_2 = path.join(CSV_PATH_2, genre) + FILE_EXTENSION
    df_1 = pd.read_csv(genre_csv_path_1)
    df_2 = pd.read_csv(genre_csv_path_2)
    df_1 = df_1.drop('Unnamed: 0',1)
    df_2 = df_2.drop('Unnamed: 0',1)
    df_combined = pd.concat([df_1,df_2],ignore_index=True)
    genre_dfs[genre] = df_combined

all_genres = pd.concat(genre_dfs.values())
all_genres.head()

# genre_dfs is now a dictionary that contains the 5 different data frames
# all_genres is a dataframe that contains all of the data

Unnamed: 0,filename,author,description,viewcount,rating,likes,dislikes,duration,length,keywords,...,colors_8_red,colors_8_blue,colors_8_green,colors_9_red,colors_9_blue,colors_9_green,colors_10_red,colors_10_blue,colors_10_green,genre
0,Luke Bryan - Roller Coaster.mp4,LukeBryanVEVO,Luke Bryan - Crash My Party\nPurchase now on i...,28948653,4.840108,127866,5324,00:04:23,263,"[Luke, Bryan, Roller, Coaster, Capitol, Record...",...,230,210,190,90,70,70,240,220,200,country
1,Dierks Bentley - Drunk On A Plane.mp4,DierksBentleyVEVO,Purchase Dierks Bentley’s latest music: http:/...,41548786,4.763639,140682,8835,00:04:51,291,"[Dierks, Bentley, Drunk, On, Plane, Capitol, R...",...,70,50,50,100,110,120,90,70,70,country
2,Thomas Rhett - Get Me Some Of That.mp4,ThomasRhettVEVO,Music video by Thomas Rhett performing Get Me ...,43868160,4.826069,128488,5841,00:03:13,193,"[Thomas, Rhett, Get, Me, Some, Of, That, The, ...",...,40,50,30,50,70,50,40,60,50,country
3,David Nail - Whatever She's Got.mp4,DavidNailVEVO,Purchase David Nail’s latest music: http://umg...,48648247,4.826632,141108,6393,00:04:01,241,"[David, Nail, Whatever, She's, Got, MCA, Nashv...",...,60,50,30,60,40,30,70,60,50,country
4,Joe Nichols - Yeah.mp4,JoeNicholsVEVO,Joe Nichols - Yeah\n“Yeah” from Joe Nichol’s C...,11397694,4.815725,33255,1606,00:03:52,232,"[Joe Nichols, Red Bow Records, Country, Yeah]",...,30,50,50,20,40,60,20,30,50,country


### Ordinal Genres
Below, we make the genres ordinal to fit in the random forest classifiers. We add a new column to our dataframe to do so, write a function to populate it, and run it across the dataframe.

In [99]:
def genre_to_ordinal(genre_in):
    if(genre_in == "country"):
        return 0
    elif(genre_in == "pop"):
        return 1
    elif(genre_in == "rock"):
        return 2
    elif(genre_in == "edm"):
        return 3
    elif(genre_in == "rap"):
        return 4
    else:
        return genre_in
    
all_genres['genre_ordinal'] = all_genres.genre.apply(genre_to_ordinal)

We add in some boolean genre classifiers to make our analysis more fine-grained. Rather than saying "we predict this video is country with 50% confidence", we could say "we predict this video is not edm with 90% confidence" and so on.

In [100]:
# Adding is_country flag
def is_country(genre_in):
    if(genre_in == "country"):
        return 1
    else:
        return 0
    
all_genres['is_country'] = all_genres.genre.apply(is_country)

# Adding is_country flag
def is_rock(genre_in):
    if(genre_in == "rock"):
        return 1
    else:
        return 0
    
all_genres['is_rock'] = all_genres.genre.apply(is_rock)

# Adding is_edm flag
def is_edm(genre_in):
    if(genre_in == "edm"):
        return 1
    else:
        return 0
    
all_genres['is_edm'] = all_genres.genre.apply(is_edm)

# Adding is_rap flag
def is_rap(genre_in):
    if(genre_in == "rap"):
        return 1
    else:
        return 0
    
all_genres['is_rap'] = all_genres.genre.apply(is_rap)

# Adding is_country flag
def is_pop(genre_in):
    if(genre_in == "pop"):
        return 1
    else:
        return 0
    
all_genres['is_pop'] = all_genres.genre.apply(is_pop)

### Test and Train Sets
We create our training and test sets by splitting all_genres by genre, and making 10 of each genre train and 10 test. We aggregate by genre to make our full train and full test sets, each containing 50 records of various genres.

In [101]:
# Subset all_genres to group by individual genres
country_records  = all_genres[all_genres["genre"] == "country"]
rock_records     = all_genres[all_genres["genre"] == "rock"]
pop_records      = all_genres[all_genres["genre"] == "pop"]
edm_records      = all_genres[all_genres["genre"] == "edm"]
rap_records      = all_genres[all_genres["genre"] == "rap"]

# From the subsets above, create train and test sets from each
country_train = country_records.head(len(country_records) / 2)
country_test  = country_records.tail(len(country_records) / 2)
rock_train    = rock_records.head(len(rock_records) / 2)
rock_test     = rock_records.tail(len(rock_records) / 2)
pop_train     = pop_records.head(len(pop_records) / 2)
pop_test      = pop_records.tail(len(pop_records) / 2)
edm_train     = edm_records.head(len(edm_records) / 2)
edm_test      = edm_records.tail(len(edm_records) / 2)
rap_train     = rap_records.head(len(rap_records) / 2)
rap_test      = rap_records.tail(len(rap_records) / 2)

# Create big training and big test set for analysis
training_set = pd.concat([country_train,rock_train,pop_train,edm_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,edm_test,rap_test])

training_set = training_set.fillna(0)
test_set = test_set.fillna(0)

print "Training Records:\t" , len(training_set)
print "Test Records:\t\t" , len(test_set)
# training_set.head()

Training Records:	405
Test Records:		405


### Generating Random Forest - Viewer Statistics
We start generating our random forests, and output a relative accuracy and a confusion matrix. In this first one, we simply factor in non-color variables (rating, likes, dislikes, length and viewcount), and run it across all records to predict an ordinal genre value.

In [102]:
# Predicting based solely on non-color features, using RF
clf = RandomForestClassifier(n_estimators=11)
meta_data_features = ['rating', 'likes','dislikes','length','viewcount']
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[meta_data_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[meta_data_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[meta_data_features]),rownames=["Actual"], colnames=["Predicted"])

0.432098765432


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,46,7,1,22,5
1,3,21,43,6,8
2,29,9,7,27,3
3,7,17,26,24,9
4,10,12,2,8,53


As shown above, this method yields relatively poor results. This is because there's no distinct clusters being created by our random forest, and simple viewer statistics tell us nothing about what kind of video we're watching. However, we see that country, rap and pop are initially somewhat distinct (diagonal is the highest value), and rock and edm are getting mistaken for one another. Let's see if we can't make something of this.

### Random Forest - Only Color Statistics
Below, we do the same random forest as above, but going strictly off of average frame color for the video.

We found the most commonly appearing color in each frame and called it the 'frame mode'. We then took all of the frame modes and found the 10 most common of them. Those became the 'color data' we use to analyze videos.

In [6]:
def gen_new_headers(old_headers):
    headers = ['colors_' + str(x+1) + '_' for x in range(10)]
    h = []
    for x in headers:
        h.append(x + 'red')
        h.append(x + 'blue')
        h.append(x + 'green')
    return old_headers + h + ['genre']

In [7]:
clf = RandomForestClassifier(n_estimators=11)
color_features = gen_new_headers([])[:-1]

# Predicting based solely on colors
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[color_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[color_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[color_features]),rownames=["Actual"], colnames=["Predicted"])

0.222222222222


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,22,23,10,15,11
1,18,10,16,21,16
2,27,21,9,10,8
3,20,8,23,18,14
4,19,14,22,17,13


This actually yields worse results than just the viewer statistics, because the color of a video by itself does not determine the genre. If rappers only had red in their videos and rockers only had black this might be somewhat accurate, but that's just not the case. But, what if we pair these findings with our initial viewer statistics? 

### Random Forest - All Features

In [103]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])

0.402469135802


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,51,5,1,18,6
1,12,17,35,15,2
2,37,17,3,9,9
3,14,14,18,27,10
4,28,11,2,11,33


### Singling Out Pop and Rap
Scores are expectedly low. It seems as if we're trying to make the classifier do way too much work, and are giving it very mediocre data to go off of. Recall that we're actually trying to determine WHICH genre a video is by the above code, not whether or not a video is of ONE specific genre. This brings back the binary classifiers that we created above, let's put those to use to see if we can improve these scores.

We try pop and rap first, since they seem to be the most distinct by what we've gathered above.

In [104]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_pop'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_pop'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])

0.8


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,305,19
1,62,19


In [105]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_rap'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_rap'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_rap, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])

0.755555555556


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,281,39
1,60,25


What we're seeing above is a confusion matrix that, based on our training data, predicts whether or not a video in the test set is a pop video or not. In the "predicted" row, 0 means it predicts it's not a pop video, and that the 1 is. Likewise with the actual, 0 shows that the video actually wasn't a pop video, and the 1 shows that it was.

The confusion matrix above is our first effort at utilizing these binary classifiers. Most of our videos aren't pop videos, and the model did a good job of picking out those that aren't pop. However, we could use some improvement in the realm of "false negatives", where the model classified a video as not pop when it actually was.

We do these tests 50 times for sake of average score.

Rather than hard-coding each time we wanted to run something for average, we wrote a function that does it for us. All we have to do is pass in the boolean classifier in quotes ("is_rock", etc.), and the number of iterations that we want. Results are displayed below.

In [106]:
def multi_RF_averages(is_genre,num_iterations):
    clf = RandomForestClassifier(n_estimators=11)
    loop_indices = range(0,num_iterations)
    cumsum = 0

    for i in loop_indices:
        y, _ = pd.factorize(training_set[is_genre])
        clf = clf.fit(training_set[all_features], y)

        z, _ = pd.factorize(test_set[is_genre])
        cumsum = cumsum + clf.score(test_set[all_features],z)
    
    print "Average Score for",len(loop_indices),is_genre,"iterations:", cumsum/len(loop_indices)

In [107]:
multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)

Average Score for 50 is_pop iterations: 0.812790123457
Average Score for 50 is_rap iterations: 0.780641975309
Average Score for 50 is_rock iterations: 0.814814814815
Average Score for 50 is_edm iterations: 0.757283950617
Average Score for 50 is_country iterations: 0.790271604938


We ran the above test with all genres, and as shown in above analysis, our country and edm typically have very low accuracy. We've seen above that edm and rock videos are getting mixed up with one another, so we assume that something is characteristic of these 2 genres that's not of everything else. We take out the edm values from our training and test datasets, hoping to improve accuracy.

In [108]:
# Removing EDM for better analysis - makes is_pop and is_rap much more accurate
training_set = pd.concat([country_train,rock_train,pop_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,rap_test])

multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)

Average Score for 50 is_pop iterations: 0.845093167702
Average Score for 50 is_rap iterations: 0.745900621118
Average Score for 50 is_rock iterations: 0.768260869565
Average Score for 50 is_edm iterations: 1.0
Average Score for 50 is_country iterations: 0.744720496894


So, what does this tell us? Based on our training data, we have the best chance of accurately classifying something as pop or not pop (under these conditions). 

We want to find out which 2 are the most distinct, so we can make build our model based on that classification.

In [109]:
training_set = pd.concat([country_train,rock_train,edm_train,rap_train,pop_train])

test_set     = pd.concat([rock_test])
multi_RF_averages("is_rock",50)

test_set     = pd.concat([rap_test])
multi_RF_averages("is_rap",50)

test_set     = pd.concat([country_test])
multi_RF_averages("is_country",50)

test_set     = pd.concat([pop_test])
multi_RF_averages("is_pop",50)

test_set     = pd.concat([edm_test])
multi_RF_averages("is_edm",50)

Average Score for 50 is_rock iterations: 0.813333333333
Average Score for 50 is_rap iterations: 0.721882352941
Average Score for 50 is_country iterations: 0.21975308642
Average Score for 50 is_pop iterations: 0.716296296296
Average Score for 50 is_edm iterations: 0.921686746988


Rock and EDM have suprisingly distinct classifiers. We should dive into the videos and see what this means.

In [17]:
test_set     = pd.concat([edm_test,rock_test])
multi_RF_averages("is_edm",50)
multi_RF_averages("is_rock",50)

Average Score for 50 is_edm iterations: 0.520632911392
Average Score for 50 is_rock iterations: 0.589367088608


### Selecting Most Valuable Features per Genre - Rock

In [112]:
model = ExtraTreesClassifier()

training_set = pd.concat([country_train,pop_train,rap_train,rock_train,edm_train])
y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)

# display the relative importance of each attribute
print model.feature_importances_

[ 0.03360183  0.02829108  0.02982243  0.0490384   0.02047459  0.01937606
  0.02202715  0.04239438  0.03467993  0.02199705  0.03537056  0.02456637
  0.02885381  0.02835152  0.02870449  0.01953121  0.02602534  0.02756057
  0.02962556  0.02307043  0.0219043   0.0273488   0.03731047  0.03296353
  0.02753547  0.02877794  0.02571059  0.02788969  0.03097814  0.01596579
  0.03276818  0.03190175  0.0351186   0.03148952  0.01897448]


In [236]:
df = pd.DataFrame()
df['index'] = all_features

y, _ = pd.factorize(training_set['is_rap'])
model.fit(training_set[all_features], y)
        
df['rap'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)

df['rock'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_country'])
model.fit(training_set[all_features], y)

df['country'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_edm'])
model.fit(training_set[all_features], y)

df['edm'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_pop'])
model.fit(training_set[all_features], y)

df['pop'] = model.feature_importances_

df

Unnamed: 0,index,rap,rock,country,edm,pop
0,rating,0.052303,0.025419,0.042144,0.048553,0.024148
1,likes,0.04282,0.038282,0.048668,0.034959,0.156237
2,dislikes,0.028491,0.034001,0.038813,0.033244,0.115148
3,length,0.041854,0.048102,0.033041,0.024917,0.013564
4,viewcount,0.064045,0.030155,0.050966,0.028763,0.162365
5,colors_1_red,0.0242,0.031048,0.022437,0.02958,0.018663
6,colors_1_blue,0.016276,0.025953,0.026887,0.031967,0.011435
7,colors_1_green,0.028815,0.034242,0.025065,0.027654,0.020421
8,colors_2_red,0.028509,0.017323,0.024548,0.025999,0.021517
9,colors_2_blue,0.028602,0.018365,0.022685,0.024641,0.012562


### Future Improvements 
- Run the above graph a number of times, take the average for each cell
- Based on the heaviest weighted parameters for each, run the random forest algorithm only taking these given parameters into consideration
- Generate a model that classifies videos dynamically
- Make more values ordinal - maybe to NLP or LDA to factor in descriptions, titles and lyrics