In [1]:
import pandas as pd
from os import path

# Edit path if need be (shouldn't need to b/c we all have the same folder structure)
CSV_PATH = '../VidAnalysis/all_data'
FILE_EXTENSION = '_all.csv'
GENRES = ['country', 'edm', 'pop', 'rap', 'rock']

# Containers for the data frames
genre_dfs = {}
all_genres = None


# Read in the 5 genre's of CV's
for genre in GENRES:
    genre_csv_path = path.join(CSV_PATH, genre) + FILE_EXTENSION
    genre_dfs[genre] = pd.read_csv(genre_csv_path)

all_genres = pd.concat(genre_dfs.values())

# genre_dfs is now a dictionary that contains the 5 different data frames
# all_genres is a dataframe that contains all of the data

In [2]:
def gen_new_headers(old_headers):
    headers = ['colors_' + str(x+1) + '_' for x in range(10)]
    h = []
    for x in headers:
        h.append(x + 'red')
        h.append(x + 'blue')
        h.append(x + 'green')
    return old_headers + h + ['genre']

In [3]:
def genre_to_ordinal(genre_in):
    if(genre_in == "country"):
        return "0"
    elif(genre_in == "pop"):
        return "1"
    elif(genre_in == "rock"):
        return "2"
    elif(genre_in == "edm"):
        return "3"
    elif(genre_in == "rap"):
        return "4"
    else:
        return genre_in
    
all_genres['genre_ordinal'] = all_genres.genre.apply(genre_to_ordinal)

In [4]:
# Adding is_country flag
def is_country(genre_in):
    if(genre_in == "country"):
        return "1"
    else:
        return "0"
    
all_genres['is_country'] = all_genres.genre.apply(is_country)

# Adding is_country flag
def is_rock(genre_in):
    if(genre_in == "rock"):
        return "1"
    else:
        return "0"
    
all_genres['is_rock'] = all_genres.genre.apply(is_rock)

# Adding is_edm flag
def is_edm(genre_in):
    if(genre_in == "edm"):
        return "1"
    else:
        return "0"
    
all_genres['is_edm'] = all_genres.genre.apply(is_edm)

# Adding is_rap flag
def is_rap(genre_in):
    if(genre_in == "rap"):
        return "1"
    else:
        return "0"
    
all_genres['is_rap'] = all_genres.genre.apply(is_rap)

# Adding is_country flag
def is_pop(genre_in):
    if(genre_in == "pop"):
        return "1"
    else:
        return "0"
    
all_genres['is_pop'] = all_genres.genre.apply(is_pop)

In [5]:
import pandas as pd

# Subset all_genres to group by individual genres
country_records  = all_genres[all_genres["genre"] == "country"]
rock_records     = all_genres[all_genres["genre"] == "rock"]
pop_records      = all_genres[all_genres["genre"] == "pop"]
edm_records      = all_genres[all_genres["genre"] == "edm"]
rap_records      = all_genres[all_genres["genre"] == "rap"]

# From the subsets above, create train and test sets from each
country_train = country_records.head(10)
country_test  = country_records.tail(10)
rock_train    = rock_records.head(10)
rock_test     = rock_records.tail(10)
pop_train     = pop_records.head(10)
pop_test      = pop_records.tail(10)
edm_train     = edm_records.head(10)
edm_test      = edm_records.tail(10)
rap_train     = rap_records.head(10)
rap_test      = rap_records.tail(10)

In [6]:
# Create big training and big test set for analysis
training_set = pd.concat([country_train,rock_train,pop_train,edm_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,edm_test,rap_test])

#training_set = training_set[['rating', 'likes', 'dislikes','genre_ordinal','length','viewcount']]
#test_set = test_set[['rating', 'likes', 'dislikes','genre_ordinal','length','viewcount']]

In [7]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Predicting based solely on non-color features
clf = RandomForestClassifier(n_estimators=10)
meta_data_features = ['rating', 'likes','dislikes','length','viewcount']
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[meta_data_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[meta_data_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[meta_data_features]),rownames=["Actual"], colnames=["Predicted"])

0.54


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6,0,0,0,4
1,1,0,8,1,0
2,0,4,1,4,1
3,2,1,0,2,5
4,0,2,1,0,7


In [8]:
clf = RandomForestClassifier(n_estimators=10)
color_features = gen_new_headers([])[:-1]

# Predicting based solely on colors
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[color_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[color_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[color_features]),rownames=["Actual"], colnames=["Predicted"])

0.14


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,5,2,2,1
1,5,1,1,1,2
2,2,2,2,1,3
3,3,1,1,3,2
4,3,1,1,4,1


In [9]:
clf = RandomForestClassifier(n_estimators=10)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])

0.34


Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6,1,0,1,2
1,0,1,6,3,0
2,0,1,2,4,3
3,4,0,0,2,4
4,6,1,0,1,2


In [10]:
# Breaking up into binary classifiers
# Remove certain genres to get better score
# Combine certain genres to get better score

In [11]:
clf = RandomForestClassifier(n_estimators=10)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_pop'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_pop'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])

0.82


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39,1
1,8,2


In [12]:
# Ranking performance of boolean classifiers, train and test sets of 50, respectively.
# 1 - is_pop (.84 avg)
# 2 - is_rap (.82 avg, fewer true negatives)
# 3 - is_rock (.78 avg, too many true negatives)
# 4 - is_edm (.--, DO NOT USE. Rarely predicts a positive edm value)
# 5 - is_country (.--, DO NOT USE. Way too many false positives)

In [13]:
# Average score over many iterations calculation
loop_indecies = range(0,50)
cumsum = 0

for i in loop_indecies:
    y, _ = pd.factorize(training_set['is_pop'])
    clf = clf.fit(training_set[all_features], y)

    z, _ = pd.factorize(test_set['is_pop'])
    #print clf.score(test_set[all_features],z)
    cumsum = cumsum + clf.score(test_set[all_features],z)
    #print pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])
    
print "Average Score for",len(loop_indecies),"is_pop iterations:", cumsum/len(loop_indecies)  

Average Score for 50 is_pop iterations: 0.858


In [14]:
# Average score over many iterations calculation
loop_indecies = range(0,50)
cumsum = 0

for i in loop_indecies:
    y, _ = pd.factorize(training_set['is_rap'])
    clf = clf.fit(training_set[all_features], y)

    z, _ = pd.factorize(test_set['is_rap'])
    #print clf.score(test_set[all_features],z)
    cumsum = cumsum + clf.score(test_set[all_features],z)
    #print pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])
    
print "Average Score for",len(loop_indecies),"is_rap iterations:", cumsum/len(loop_indecies)  

Average Score for 50 is_rap iterations: 0.7524


In [15]:
def multi_RF_averages(is_genre,num_iterations):
    clf = RandomForestClassifier(n_estimators=10)
    loop_indecies = range(0,num_iterations)
    cumsum = 0

    for i in loop_indecies:
        y, _ = pd.factorize(training_set[is_genre])
        clf = clf.fit(training_set[all_features], y)

        z, _ = pd.factorize(test_set[is_genre])
        cumsum = cumsum + clf.score(test_set[all_features],z)
    
    print "Average Score for",len(loop_indecies),is_genre,"iterations:", cumsum/len(loop_indecies)

In [16]:
multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)

Average Score for 50 is_pop iterations: 0.856
Average Score for 50 is_rap iterations: 0.7468
Average Score for 50 is_rock iterations: 0.7764
Average Score for 50 is_edm iterations: 0.7556
Average Score for 50 is_country iterations: 0.746


In [17]:
# Removing EDM for better analysis - makes is_pop and is_rap much more accurate
training_set = pd.concat([country_train,rock_train,pop_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,rap_test])

multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)

Average Score for 50 is_pop iterations: 0.8765
Average Score for 50 is_rap iterations: 0.705
Average Score for 50 is_rock iterations: 0.6805
Average Score for 50 is_edm iterations: 1.0
Average Score for 50 is_country iterations: 0.7165
