In [1]:
import pandas as pd
import nltk
import numpy as np
import re
from gensim.models import Word2Vec
import lightgbm as lgb
from sklearn.feature_selection import RFE

In [2]:
# The functions to transform permission vector to numberical label or vice versa.

limited_per_list = ['take pictures and videos','read the contents of your USB storage', 'record audio',
 'read your contacts','approximate location (network-based)',
 'precise location (GPS and network-based)']
def vector_to_label(per_vector):
    i = len(per_vector) - 1
    j = 0
    label = 0
    while i >= 0 :
        label += 2**(i) * per_vector[j]
        i -= 1
        j += 1
    return label

def label_to_vector(label):
    res = [int(i) for i in list('{0:0b}'.format(label))]
    while len(res) != 6:
        res.insert(0,0)
    return res


In [3]:
# The functions to prepare dataframe for feeding into gbm
def transform_per(raw_permission):
    p_list = raw_permission.split(",")
    i = 0
    per_vector = [0,0,0,0,0,0]
    while (i < len(p_list)):
        j = 0
        while (j < len(limited_per_list)):
            if(p_list[i] == limited_per_list[j]):
                per_vector[j] = 1
            j += 1
        i += 1
    return per_vector

def des_to_vector(raw_description):
    global word_list
    des_list = raw_description.split()
    i = 0
    des_vector = [0]*len(word_list)
    while i < len(des_list):
        j = 0
        while j < len(word_list):
            if(des_list[i] == word_list[j]):
                des_vector[j] = 1
            j += 1
        i += 1
    return des_vector

def process_df(df):
    df = df.dropna()
    df.reset_index(inplace=True)
    new_per_col = df['AppPermission'].map(transform_per)
    df['AppPermission'] = new_per_col
    model = Word2Vec.load("word2vec.model")
    vocab_obj = model.wv.vocab
    global word_list
    word_list = vocab_obj.keys()
    word_list = list(word_list)
    new_des_col = df['AppDescription'].map(des_to_vector)
    df['AppDescription'] = new_des_col
    df.reset_index(inplace=True)
    i = 0
    while i < len(word_list):
        j = 0
        new_col = [0] * df['AppDescription'].size
        while j < df['AppDescription'].size:
            new_col[j] = df['AppDescription'][j][i]
            j += 1
        df[word_list[i]] = pd.Series(new_col)
        i += 1
    df = df.drop(['level_0','index'],axis=1)
    df['Per_Label'] = df['AppPermission'].map(vector_to_label)

    return df

In [4]:
test_df = pd.read_csv("data/test_data.csv")
ref_df = pd.read_csv("data/raw_reference_data.csv")

In [5]:
test_df = process_df(test_df)
ref_df = process_df(ref_df)
train = ref_df.drop(['AppID','AppDescription','AppPermission','Per_Label'],axis=1)
test = test_df.drop(['AppID','AppDescription','AppPermission','Per_Label'],axis=1)
label = ref_df['Per_Label']
label_test = test_df['Per_Label']
test_df.to_csv("data/test_data_processed.csv")
ref_df.to_csv("data/ref_data_processed.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AppPermission'] = new_per_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AppDescription'] = new_des_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[word_list[i]] = pd.Series(new_col)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

In [9]:
## Calculate the feature importance and drop the features that have low impact
gbm = lgb.LGBMClassifier()
gbm.fit(train, label)
gbm.booster_.feature_importance()
fea_imp_ = pd.DataFrame({'cols':train.columns, 'fea_imp':gbm.feature_importances_})
new_fea_imp = fea_imp_.loc[fea_imp_.fea_imp > 0].sort_values(by=['fea_imp'], ascending = False)
new_ref_train = train[new_fea_imp['cols']]
new_test = test[new_fea_imp['cols']]

In [10]:
## New interation of training and predicting
gbm = lgb.LGBMClassifier()
gbm.fit(new_ref_train, label)
y_  = gbm.predict(new_ref_train)

In [11]:
i = 0
err = 0
while i < len(y_):
    if(y_[i] != label[i]):
    
        err += 1
    i += 1
print((i-err)/i * 100)

99.67532467532467


In [23]:
y_pop  = gbm.predict(new_test)

In [24]:
# To evaluate the Pop set, we use following scoring laws:
# 1. If Pop App uses same or less Permissions as Ref, A Green Light.
# 2. If Pop App uses more Permissions as Ref, A Red Light.
# 3. If Pop App uses some less and some more Permissions as Ref, we consider the prediction failed, as Black Light.
i = 0
Green = 0
Red = 0
Black = 0
while i < len(y_pop):
    if(y_pop[i] == label_test[i]):
        Green += 1
    else:
        y_vec = label_to_vector(y_pop[i])
        label_vec = label_to_vector(label_test[i])
        j = 0
        MORE = False
        LESS = False
        FAIL = False
        while j < 6:
            if(y_vec[j] == label_vec[j]):
                j += 1
                continue
            if(y_vec[j] > label_vec[j]):
                if LESS:
                    Black += 1
                    FAIL = True
                    break
                else:
                    MORE = True
                    j += 1
                    continue
            if(y_vec[j] < label_vec[j]):
                if MORE:
                    Black += 1
                    FAIL = True
                    break
                else:
                    LESS = True
                    j += 1
                    continue
        if not FAIL:
            if MORE:
                Red += 1
            else:
                Green += 1
    i += 1

print(i)
print(Green)
print(Red)
print(Black)

1470
1093
250
127
