In [97]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [184]:
raw_path = "C:/Users/j3543/Desktop/DSC180A-B05-Project-main/data/raw/"
out_path = "C:/Users/j3543/Desktop/DSC180A-B05-Project-main/data/out/result.csv"

In [185]:
import sys
import os
import pandas as pd
import numpy as np
import sys
import json
from scipy.signal import find_peaks

def cal_attributes(entries, raw_path,out_path):
    # I extracted the features based on the columns below.
    cols = ['Time','1->2Bytes','2->1Bytes','1->2Pkts','2->1Pkts']
    temp = pd.read_csv(raw_path + entries[0])[cols]
    # I used the describe() function to find the attributes 
    #of each column, which includes mean, median, max min etc.
    desc = temp.describe()
    new_cols = {}
    
    # I used the for loop to make a dictionary with only keys of 
    #the generated features name from the function describe().
    for col in desc.columns:
        temp_col = {}
        name = ""
        for ind in desc.index:
            name = str(col) + "_" + str(ind)
            new_cols[name] = []
            
    # Adding more keys related to peaks to the dictionary 
    peak_col = ['1->2Bytes','2->1Bytes','1->2Pkts','2->1Pkts']
    for i in peak_col:
            name = str(i) + "_" + "peak_num"
            new_cols[name] = []
    
    # I used the for loop to populate the dictionary created above 
    #with the data from describe() and find_peaks.
    for i in entries:
        p144 = pd.read_csv(raw_path + i)
        temp = p144[cols]
        
        #Here I used the find_peak to count the number of spikes 
        #of each column except Time.
        for i in peak_col:
            peaks, _ = find_peaks(temp[i], height=0)
            name = str(i) + "_" + "peak_num"
            item = len(peaks)
            new_cols[name] = new_cols[name] + [item]
            
        #This block of code is used to put the data generated by 
        #describe() into the dictionary.
        desc = temp.describe()
        for col in desc.columns:
            temp_col = {}
            name = ""
            for ind in desc.index:
                name = str(col) + "_" + str(ind)
                item = desc[col][ind]
                new_cols[name] = new_cols[name] + [item]
    # The function returns a tabel of features for each file.
    feats = pd.DataFrame.from_dict(new_cols)
    feats.to_csv(out_path + "cleaned_features.csv", index = False, header=True)
    return feats

def cal_labels(entries,raw_path,out_path):
    #This for loop is to convert the list of names of the files into a 2d 
    #lists called new_str.
    #Each element in this list is a list of cleaned labels of the file.
    new_str = []
    for i in entries:
        # The following process replace the wrong formating and wording 
        #into the correct format.
        temp=i.replace('[','')
        temp=temp.replace(']','')
        temp=temp.replace(' ','')
        temp=temp.replace('_','-')
        for i in range(10):
            string = "-"+str(i) + ".csv"
            temp=temp.replace(string,'')
        temp=temp.replace('.csv','')
        temp=temp.replace('nonvpn','novpn')
        for num in range(10):
            temp=temp.replace('('+str(num)+')','')
        str_list=temp.split('-')
        new_str.append(str_list)
    labels = {'username','video/novideo','streaming_provider','quality','playback_speed','vpn/novpn','platform','clean/noisy','date'}

    big_list = set(list(np.concatenate(new_str).flat))
    #For each of the labels, I created a empty list which would be 
    #populated later using if statements.
    username = [i[0] for i in new_str]
    video = []
    streaming_provider = []
    quality = []
    playback_speed = []
    vpn = []
    platform = []
    clean = []
    date = [i[-1][:8] for i in new_str]
    
    #This for look check every entries of new_str(the cleaned list of 
    #file names) and populate each of labels' array one by one.
    for i in new_str:
        #For example, if "nonvpn" is in this entry, then the "nonvpn"
        #would be appended to the vpn/novpn list.
        if "novpn" in i:
            vpn.append("novpn")
        else:
            vpn.append("vpn")
        #The following if statements have the same logic as the above
        #if statement.
        if "novideo" in i:
            video.append("novideo")
        else:
            video.append("video")
        
        if "youtube" in i:
            streaming_provider.append("youtube")
        elif "hulu" in i:
            streaming_provider.append("hulu")
        elif "amazon" in i:
            streaming_provider.append("amazonprime")
        elif "netflix" in i:
            streaming_provider.append("netflix")
        elif "bilibili" in i:
            streaming_provider.append("bilibili")
        else:
            streaming_provider.append("NA")

        if "clean" in i:
            clean.append("clean")
        elif "noisy" in i:
            clean.append("noisy")
        else:
            clean.append("NA")

        counter = 0
        for j in i:
            if counter == len(i)-1:
                quality.append("NA")
                break
            if "1080" in j:
                quality.append("1080p")
                break
            elif "2160" in j:
                quality.append("2160p")
                break
            elif "240" in j:
                quality.append("240p")
                break
            elif "480" in j:
                quality.append("480p")
                break
            elif "144" in j:
                quality.append("144p")
                break
            elif "320" in j:
                quality.append("320p")
                break
            elif "720" in j:
                quality.append("720p")
                break
            counter = counter + 1

        if "windows" in i:
            platform.append("windows")
        elif "mac" in i:
            platform.append("mac")
        elif "linux" in i:
            platform.append("linux")
        else:
            platform.append("NA")

        if ("1x" or "1") in i:
            playback_speed.append("1x")
        elif ("2x" or "2" or "2.0x") in i:
            playback_speed.append("2x")
        elif ("1.5x" or "1.5x" or "1.5x") in i:
            playback_speed.append("1.5x")
        elif ("1.4x" or "1.4"or "1.40x") in i:
            playback_speed.append("1.4x")
        elif ("0.5x" or "0.5") in i:
            playback_speed.append("0.5x")
        elif ("0.25x" or "0.25") in i:
            playback_speed.append("0.25x")
        elif ("1.25x" or "1.25") in i:
            playback_speed.append("1.25x")
        elif ("1.75x" or "1.75") in i:
            playback_speed.append("1.75x")
        else:
            playback_speed.append("NA")
    
    # Finally, we creat a dictionary of each label corresponding 
    #to tehir lists for all files 
    # and then convert this dictionary to table as the resturned result.
    labels = {'username':username,'video/novideo':video,'streaming_provider':streaming_provider,'quality':quality,'playback_speed':playback_speed,'vpn/novpn':vpn,'platform':platform,'clean/noisy':clean,'date':date}
    label_table = pd.DataFrame.from_dict(labels)
    label_table.to_csv(out_path + 'cleaned_labels.csv', index = False, header=True)
    return label_table

def combine_output(entries,raw_path, out_path):
    df1 = cal_attributes(entries,raw_path, out_path)
    df2 = cal_labels(entries,raw_path, out_path)
    horizontal_stack = pd.concat([df2, df1], axis=1)
    horizontal_stack.to_csv(out_path + "combined_result.csv", index = False, header=True)
    return horizontal_stack

def original_out(raw_path,out_path):
    new1 = []
    new2 = []
    new3 = []
    new4 = []
    entries = os.listdir(raw_path)
    #entries.remove('.git')
    cal_labels(entries,raw_path,out_path).to_csv(out_path+"original_out.csv", index = False, header=True)
    return entries


In [186]:
ok = original_out(raw_path,out_path)

In [187]:
df =  combine_output(ok,raw_path, out_path)

In [63]:
labels = pd.read_csv("cleaned_labels.csv")

In [104]:
features = pd.read_csv("cleaned_features.csv")

In [105]:
labels

Unnamed: 0,username,video/novideo,streaming_provider,quality,playback_speed,vpn/novpn,platform,clean/noisy,date
0,arv020,1,,480p,1x,vpn,mac,clean,20201101
1,imnemato,1,,1080p,1x,vpn,mac,clean,20201031
2,stdoan,0,,,,vpn,windows,,20201029
3,jeq004,1,netflix,1080p,1x,vpn,mac,clean,20201101
4,imnemato,1,,1080p,1x,vpn,mac,clean,20201031
...,...,...,...,...,...,...,...,...,...
174,chy238,1,,,1x,vpn,windows,clean,20201103
175,zij034,1,youtube,,1x,vpn,windows,clean,20201101
176,pgaddiso,1,youtube,720p,1x,vpn,linux,clean,20201102
177,dyaseen,1,,,,novpn,windows,clean,20201102


In [106]:
features = features.iloc[:, 8:43]
features

Unnamed: 0,1->2Bytes_count,1->2Bytes_mean,1->2Bytes_std,1->2Bytes_min,1->2Bytes_25%,1->2Bytes_50%,1->2Bytes_75%,1->2Bytes_max,2->1Bytes_count,2->1Bytes_mean,...,2->1Pkts_mean,2->1Pkts_std,2->1Pkts_min,2->1Pkts_25%,2->1Pkts_50%,2->1Pkts_75%,2->1Pkts_max,1->2Bytes_peak_num,2->1Bytes_peak_num,1->2Pkts_peak_num
0,39823.0,522.630917,869.631139,0.0,190.00,324.0,454.00,20837.0,39823.0,674.446300,...,0.542651,12.884026,0.0,0.0,0.0,0.00,833.0,11793,998,4428
1,208.0,10252.245192,25378.133865,32.0,217.00,436.0,15233.75,197377.0,208.0,438950.370192,...,327.764423,508.281720,0.0,0.0,1.5,752.25,1680.0,72,64,63
2,1670.0,1753.118563,4826.919546,0.0,61.00,152.0,1584.50,83230.0,1670.0,40290.303593,...,30.295808,131.456713,0.0,1.0,1.0,7.00,2131.0,464,463,404
3,84.0,1613.297619,4322.107451,0.0,224.25,291.0,667.50,26393.0,84.0,10502.976190,...,9.404762,65.349868,0.0,0.0,0.0,0.25,596.0,21,18,15
4,225.0,11213.537778,26657.856416,0.0,262.00,434.0,11994.00,150232.0,225.0,344586.622222,...,257.297778,455.162388,0.0,0.0,1.0,361.00,1596.0,63,56,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,324.0,23491.459877,33257.810287,0.0,1884.75,9807.0,33746.25,239075.0,324.0,375604.037037,...,280.444444,397.938239,0.0,5.0,111.5,450.75,3496.0,94,98,98
175,468.0,13646.286325,46535.150984,0.0,133.75,424.0,1008.00,422098.0,468.0,159171.989316,...,120.788462,442.261819,0.0,0.0,0.0,2.00,5162.0,140,100,102
176,759.0,3038.623188,13049.231975,0.0,136.00,160.0,385.00,118960.0,759.0,63256.405797,...,49.131752,255.463831,0.0,0.0,0.0,1.00,1934.0,245,179,190
177,2191.0,3566.839343,7560.993421,0.0,72.00,190.0,1921.00,38700.0,2191.0,145413.570972,...,97.552716,238.076533,0.0,0.0,1.0,3.00,1151.0,649,582,553


In [107]:
labels = labels.replace(["video", "novideo"], [1, 0])

In [108]:
y = labels.iloc[:,1]

In [163]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.15, shuffle = True)

In [164]:
model1 = RandomForestClassifier(random_state=1)

In [165]:
model1.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [166]:
y_pred = model1.predict(X_test)
# evaluate predictions
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(acc))

Accuracy: 0.9259259259259259


In [180]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [173]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.15, shuffle = True)

In [175]:
model2.fit(X_train, y_train)
y_pred2 = model1.predict(X_test)
# evaluate predictions
acc2 = accuracy_score(y_test, y_pred2)
print('Accuracy: {}'.format(acc2))

Accuracy: 0.9629629629629629


