In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from sklearn.impute import KNNImputer

In [None]:
#load data

In [None]:
# All four exports are tab-separated. Malformed rows are skipped silently, so the
# loaded row counts may be lower than the number of lines in the raw files.
tsv_options = {'delimiter': '\t', 'on_bad_lines': 'skip'}

connections_df = pd.read_csv('data/connections.csv', **tsv_options)
processes_df = pd.read_csv('data/processes.csv', **tsv_options)
profiles_df = pd.read_csv('data/profiles.csv', **tsv_options)
devices_df = pd.read_csv('data/devices.csv', **tsv_options)

In [None]:
#1)
#A) Analyze data structures

In [None]:
# Column index of each raw table, kept under the short names c1..c4.
c1, c2, c3, c4 = (
    frame.columns
    for frame in (connections_df, processes_df, profiles_df, devices_df)
)

# One line per table: label followed by its column index.
for label, column_index in (
    ("connections :", c1),
    ("processes :", c2),
    ("profiles :", c3),
    ("devices :", c4),
):
    print(label, column_index)

In [None]:
# Descriptive statistics (describe() defaults) for each of the four tables.
for heading, frame in (
    ("\nSummary for connections:", connections_df),
    ("\nSummary for processes:", processes_df),
    ("\nSummary for profiles:", profiles_df),
    ("\nSummary for devices:", devices_df),
):
    print(heading)
    print(frame.describe())

In [None]:
# Descriptive statistics restricted to the non-numeric columns of the two
# tables summarised here (profiles and devices).
for heading, frame in (
    ("\nSummary for profiles:", profiles_df),
    ("\nSummary for devices:", devices_df),
):
    print(heading)
    print(frame.describe(exclude=np.number))

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" after
# every report.
for heading, frame in (
    ("\nInfo for connections:", connections_df),
    ("\nInfo for processes:", processes_df),
    ("\nInfo for profiles:", profiles_df),
    ("\nInfo for devices:", devices_df),
):
    print(heading)
    frame.info()

In [None]:
# Combine datasets: inner-join the four tables on 'imei', then discard rows that
# are exact duplicates of one another.
# NOTE(review): 'imei' is the only join key, so any other column shared by two
# tables (connections and processes both carry 'mwra') comes out with pandas'
# default _x/_y suffixes, and repeated 'imei' values multiply rows -- confirm
# this is the intended join.
combined_df = (
    connections_df
    .merge(processes_df, on='imei', how='inner')
    .merge(profiles_df, on='imei', how='inner')
    .merge(devices_df, on='imei', how='inner')
    .drop_duplicates()
)
print(combined_df.shape)

In [None]:
#B) Attribute analysis
# We chose the selected attributes based on the correlation matrix, using ones that have high correlation with MWRA

In [None]:
# Columns of the connections table inspected individually in the next cells.
selected_attributes_connections = [
    'c.android.youtube',
    'c.android.gm',
    'c.android.chrome',
    'c.katana',
    'c.updateassist',
]

# Columns of the processes table inspected individually in the next cells.
selected_attributes_processes = [
    'p.android.settings',
    'p.android.packageinstaller',
    'p.android.gm',
    'p.system',
    'p.android.externalstorage',
]

In [None]:
# One histogram plus summary statistics per selected connections attribute.
for attribute in selected_attributes_connections:
    attribute_values = connections_df[attribute]
    hist_ax = sns.histplot(attribute_values)
    hist_ax.set_title(f'Distribution of {attribute}')
    plt.show()
    print(attribute_values.describe())

In [None]:
# One histogram plus summary statistics per selected processes attribute.
for attribute in selected_attributes_processes:
    attribute_values = processes_df[attribute]
    hist_ax = sns.histplot(attribute_values)
    hist_ax.set_title(f'Distribution of {attribute}')
    plt.show()
    print(attribute_values.describe())

In [None]:
#C) Pair analysis

In [None]:
# Numeric-only views of both tables; the correlation and pair plots below are
# built from these.
connections_numeric_df = connections_df.select_dtypes(include='number')
processes_numeric_df = processes_df.select_dtypes(include='number')

In [None]:
# Pairwise correlations of the numeric connections columns, drawn as an
# annotated heatmap.
connections_corr = connections_numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(connections_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
ax.set_title("Correlation Heatmap for Connections Dataset")
plt.show()

In [None]:
# Columns left out of the connections pair plot.
columns_to_exclude = [
    'c.UCMobile.x86',
    'c.UCMobile.intl',
    'c.raider',
    'c.android.vending',
    'imei',
]
filtered_connections_df = connections_numeric_df.drop(columns_to_exclude, axis='columns')

# Scatter matrix of every remaining numeric column against every other.
sns.pairplot(filtered_connections_df)
plt.show()

In [None]:
# Pairwise correlations of the numeric processes columns, drawn as an annotated
# heatmap.
processes_corr = processes_numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(processes_corr, annot=True, cmap="coolwarm", fmt=".1f", ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
ax.set_title("Correlation Heatmap for Processes Dataset")
plt.show()

In [None]:
# Columns left out of the processes pair plot.
columns_to_exclude = [
    'p.google',
    'p.olauncher',
    'p.android.gms',
    'p.browser.provider',
    'p.process.gapps',
    'p.dogalize',
    'p.android.vending',
    'p.gms.persistent',
    'p.android.defcontainer',
    'p.simulator',
    'p.notifier',
    'p.inputmethod.latin',
    'p.katana',
    'imei',
]
filtered_processes_df = processes_numeric_df.drop(columns_to_exclude, axis='columns')

# Scatter matrix of every remaining numeric column against every other.
sns.pairplot(filtered_processes_df)
plt.show()

In [None]:
#D) Pair analysis focused on predicted attribute

In [None]:
# Correlation of every numeric connections column with 'mwra', sorted ascending
# (most negative first, most positive last).
connections_correlations = (
    connections_numeric_df
    .corr()
    .loc[:, 'mwra']
    .sort_values(ascending=True)
)
print("Connections correlations with mwra:\n", connections_correlations)

In [None]:
# The correlations are sorted ascending, so the strongest positive ones sit at
# the tail. Two kinds of entries must be removed before taking that tail:
#   * 'mwra' itself -- its self-correlation is 1.0, so it would always be
#     selected as its own "predictor";
#   * undefined (NaN) correlations -- sort_values() places them last, so they
#     would crowd out real predictors.
connections_potential_predictors = (
    connections_correlations
    .drop(labels='mwra', errors='ignore')
    .dropna()
    .index[-5:]
)

# Scatter plot of each candidate predictor against the target.
for predictor in connections_potential_predictors:
    sns.scatterplot(x=connections_df[predictor], y=connections_df['mwra'])
    plt.title(f'Relation between {predictor} and mwra')
    plt.show()

In [None]:
# Ordinary least squares of 'mwra' on the candidate connections predictors.
# The target must never appear among the regressors: regressing 'mwra' on itself
# yields a trivially perfect fit and meaningless coefficients.
predictor_columns = [
    name for name in connections_potential_predictors if name != 'mwra'
]
X = connections_df[predictor_columns]
y = connections_df['mwra']

# Add the intercept column expected by statsmodels.
X = sm.add_constant(X)

# missing='drop' skips rows containing NaN; missing values are only handled
# later in this notebook, so the raw table may still contain them here.
model = sm.OLS(y, X, missing='drop').fit()
print(model.summary())

In [None]:
# Correlation of every numeric processes column with 'mwra', sorted ascending
# (most negative first, most positive last).
processes_correlations = (
    processes_numeric_df
    .corr()
    .loc[:, 'mwra']
    .sort_values(ascending=True)
)
print("Processes correlations with mwra:\n", processes_correlations)

In [None]:
# The correlations are sorted ascending, so the strongest positive ones sit at
# the tail. Two kinds of entries must be removed before taking that tail:
#   * 'mwra' itself -- its self-correlation is 1.0, so it would always be
#     selected as its own "predictor";
#   * undefined (NaN) correlations -- sort_values() places them last, so they
#     would crowd out real predictors.
processes_potential_predictors = (
    processes_correlations
    .drop(labels='mwra', errors='ignore')
    .dropna()
    .index[-5:]
)

# Scatter plot of each candidate predictor against the target.
for predictor in processes_potential_predictors:
    sns.scatterplot(x=processes_df[predictor], y=processes_df['mwra'])
    plt.title(f'Relation between {predictor} and mwra')
    plt.show()

In [None]:
# Ordinary least squares of 'mwra' on the candidate processes predictors.
# The target must never appear among the regressors: regressing 'mwra' on itself
# yields a trivially perfect fit and meaningless coefficients.
predictor_columns = [
    name for name in processes_potential_predictors if name != 'mwra'
]
X = processes_df[predictor_columns]
y = processes_df['mwra']

# Add the intercept column expected by statsmodels.
X = sm.add_constant(X)

# missing='drop' skips rows containing NaN; missing values are only handled
# later in this notebook, so the raw table may still contain them here.
model = sm.OLS(y, X, missing='drop').fit()
print(model.summary())

In [None]:
#2
#A) Identification of problems in the data

In [None]:
# Number of rows in each table that fully repeat an earlier row.
for label, frame in (
    ("Duplicated records in connections:", connections_df),
    ("Duplicated records in processes:", processes_df),
    ("Duplicated records in profiles:", profiles_df),
    ("Duplicated records in devices:", devices_df),
):
    print(label, frame.duplicated().sum())

In [None]:
#Check missing values
# Per table: how many rows contain at least one missing cell.
for label, frame in (
    ("Rows with missing values in connections:", connections_df),
    ("Rows with missing values in processes:", processes_df),
    ("Rows with missing values in profiles:", profiles_df),
    ("Rows with missing values in devices:", devices_df),
):
    incomplete_rows = frame.isnull().any(axis=1)
    print(label, len(frame[incomplete_rows]))

# Per-column breakdown for profiles.
print("\nMissing values in profiles:")
print(profiles_df.isnull().sum())

In [None]:
# Try to convert every object-typed connections column to a numeric dtype.
# Columns that cannot be parsed are reported and left unchanged.
for col in connections_df.columns:
    if connections_df[col].dtype != 'object':
        continue
    try:
        converted = pd.to_numeric(connections_df[col])
    except ValueError:
        print(f"Connections - Non-numeric values found in column {col}")
    else:
        connections_df[col] = converted

In [None]:
# Try to convert every object-typed processes column to a numeric dtype.
# Columns that cannot be parsed are reported and left unchanged.
for col in processes_df.columns:
    if processes_df[col].dtype != 'object':
        continue
    try:
        converted = pd.to_numeric(processes_df[col])
    except ValueError:
        print(f"Processes - Non-numeric values found in column {col}")
    else:
        processes_df[col] = converted

In [None]:
# Try to convert every object-typed devices column to a numeric dtype.
# Columns that cannot be parsed are reported and left unchanged.
for col in devices_df.columns:
    if devices_df[col].dtype != 'object':
        continue
    try:
        converted = pd.to_numeric(devices_df[col])
    except ValueError:
        print(f"Devices - Non-numeric values found in column {col}")
    else:
        devices_df[col] = converted

In [None]:
# Try to convert every object-typed profiles column to a numeric dtype.
# Columns that cannot be parsed are reported and left unchanged.
for col in profiles_df.columns:
    if profiles_df[col].dtype != 'object':
        continue
    try:
        converted = pd.to_numeric(profiles_df[col])
    except ValueError:
        print(f"Profiles - Non-numeric values found in column {col}")
    else:
        profiles_df[col] = converted

In [None]:
# One boxplot per numeric connections column.
numeric_columns = connections_df.select_dtypes(include='number').columns
for col in numeric_columns:
    box_ax = sns.boxplot(x=connections_df[col])
    box_ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# One boxplot per numeric processes column.
numeric_columns = processes_df.select_dtypes(include='number').columns
for col in numeric_columns:
    box_ax = sns.boxplot(x=processes_df[col])
    box_ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
#B) Missing values

In [None]:
def handle_missing_values(df, threshold, impute_method):
    """Drop sparsely populated columns, then optionally impute numeric gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a new frame is returned.
    threshold : float
        Minimum fraction of non-missing values a column needs in order to be
        kept (e.g. 0.05 keeps columns that are at least 5% populated). The
        fraction is converted to a row count with ``int(len(df) * threshold)``.
    impute_method : str
        ``'mean'`` fills each numeric column with its own mean, ``'knn'`` fills
        numeric columns with ``KNNImputer(n_neighbors=5)``. Any other value
        (e.g. ``''``) performs no imputation, so only the column filter runs.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the sparse columns and with numeric gaps filled
        according to ``impute_method``. Non-numeric columns are never imputed.
    """
    # Explicit copy: the column writes below must neither touch the caller's
    # frame nor trigger chained-assignment warnings on a derived frame.
    df = df.dropna(thresh=int(len(df) * threshold), axis=1).copy()

    numeric_columns = df.select_dtypes(include=[np.number]).columns

    if impute_method == 'mean':
        for col in numeric_columns:
            df[col] = df[col].fillna(df[col].mean())
    elif impute_method == 'knn':
        # KNNImputer drops features with no observed value from its output,
        # which would make the assignment below fail on a shape mismatch, so
        # only columns holding at least one value are imputed. Nothing is done
        # when no such column (or no row) exists.
        imputable = [col for col in numeric_columns if df[col].notna().any()]
        if imputable and len(df) > 0:
            knn_imputer = KNNImputer(n_neighbors=5)
            df[imputable] = knn_imputer.fit_transform(df[imputable])

    return df

In [None]:
# Connections: keep columns that are at least 5% populated, mean-fill the rest.
missing_handled_connections = handle_missing_values(connections_df, 0.05, 'mean')

# Profiles: count incomplete rows before and after dropping sparse columns.
profiles_incomplete_rows = profiles_df.isnull().any(axis=1)
print("Rows with missing values in profiles:", len(profiles_df[profiles_incomplete_rows]))

# can not replace values since they are not numeric, so the empty impute method
# is passed and only the 85% column filter is applied
m_profiles_df = handle_missing_values(profiles_df, 0.85, '')

m_profiles_incomplete_rows = m_profiles_df.isnull().any(axis=1)
print("Rows with missing values in profiles:", len(m_profiles_df[m_profiles_incomplete_rows]))

In [None]:
#C) Outlier detection

In [None]:
def handle_outliers(df, method, lower_quantile, upper_quantile):
    """Remove or clamp values outside a per-column quantile range.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a new frame is returned.
    method : str
        ``'remove'`` drops rows whose value lies outside the bounds,
        ``'replace'`` clamps such values to the nearest bound. Any other value
        leaves the data unchanged.
    lower_quantile, upper_quantile : float
        Quantiles in [0, 1] that define the accepted range of every numeric
        column, e.g. 0.05 and 0.95.

    Returns
    -------
    pandas.DataFrame
        The processed copy of ``df``. Non-numeric columns are never altered.

    Raises
    ------
    ValueError
        If ``lower_quantile`` is greater than ``upper_quantile``.
    """
    if lower_quantile > upper_quantile:
        raise ValueError(
            "lower_quantile must not be greater than upper_quantile "
            f"(got {lower_quantile} > {upper_quantile})"
        )

    # Work on a copy: in 'replace' mode the columns are overwritten, which would
    # otherwise silently change the caller's DataFrame as well.
    df = df.copy()

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        # Bounds are computed on the current state of the frame. In 'remove'
        # mode this means rows dropped for an earlier column no longer
        # contribute to the quantiles of later columns.
        lower_bound = df[col].quantile(lower_quantile)
        upper_bound = df[col].quantile(upper_quantile)

        if method == 'remove':
            # Comparisons with NaN are False, so rows missing this column are
            # dropped as well.
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        elif method == 'replace':
            values = df[col]
            df[col] = np.where(values < lower_bound, lower_bound,
                               np.where(values > upper_bound, upper_bound, values))

    return df

In [None]:
# Clamp every numeric connections column to its 5th-95th percentile range.
# The lower quantile is 0.05 rather than 0.5: with 0.5 every value below the
# median would be overwritten by the median, flattening half of each column.
# A copy is passed so the "before" frame below really is the untouched input,
# regardless of whether handle_outliers modifies its argument.
o_connections_df = handle_outliers(connections_df.copy(), 'replace', 0.05, 0.95)

# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
connections_df.info()
o_connections_df.info()
