In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing

In [None]:
# All four inputs are tab-delimited files under data/. on_bad_lines='skip'
# tells pandas to drop lines it cannot parse instead of raising.
read_options = dict(delimiter='\t', on_bad_lines='skip')

connections_df = pd.read_csv('data/connections.csv', **read_options)
processes_df = pd.read_csv('data/processes.csv', **read_options)
profiles_df = pd.read_csv('data/profiles.csv', **read_options)
devices_df = pd.read_csv('data/devices.csv', **read_options)

In [None]:
# Grab the column index of each table; c1..c4 stay available as
# notebook-level names.
c1, c2, c3, c4 = (
    frame.columns
    for frame in (connections_df, processes_df, profiles_df, devices_df)
)

# Print one labelled line per table.
for label, column_index in (
    ("connections :", c1),
    ("processes :", c2),
    ("profiles :", c3),
    ("devices :", c4),
):
    print(label, column_index)

In [None]:
# Count, per table, the rows that contain at least one null cell.
for label, frame in (
    ("Rows with missing values in connections:", connections_df),
    ("Rows with missing values in processes:", processes_df),
    ("Rows with missing values in profiles:", profiles_df),
    ("Rows with missing values in devices:", devices_df),
):
    # Boolean mask: True where the row has any null; summing counts those rows.
    has_null = frame.isnull().any(axis=1)
    print(label, int(has_null.sum()))

# Per-column null counts for the profiles table only.
print("\nMissing values in profiles:")
profiles_null_counts = profiles_df.isnull().sum()
print(profiles_null_counts)

In [None]:
# Default describe() summary for each table, printed under its own header.
for header, frame in (
    ("\nSummary for connections:", connections_df),
    ("\nSummary for processes:", processes_df),
    ("\nSummary for profiles:", profiles_df),
    ("\nSummary for devices:", devices_df),
):
    print(header)
    summary = frame.describe()
    print(summary)

In [None]:
# describe() restricted to the non-numeric columns (numeric dtypes excluded)
# of the profiles and devices tables.
for header, frame in (
    ("\nSummary for profiles:", profiles_df),
    ("\nSummary for devices:", devices_df),
):
    print(header)
    non_numeric_summary = frame.describe(exclude=np.number)
    print(non_numeric_summary)

In [None]:
# Show dtype / non-null information for every table.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line
# after each report.
print("\nInfo for connections:")
connections_df.info()
print("\nInfo for processes:")
processes_df.info()
print("\nInfo for profiles:")
profiles_df.info()
print("\nInfo for devices:")
devices_df.info()

In [None]:
# Outer-join the four tables on the shared 'imei' key, starting from
# connections and folding in the other tables one at a time, so an imei that
# appears in any table is kept.
# NOTE(review): if 'imei' is not unique within a table, each join multiplies
# matching rows -- verify key uniqueness before relying on the row count.
combined_df = connections_df
for right_table in (processes_df, profiles_df, devices_df):
    combined_df = combined_df.merge(right_table, how='outer', on='imei')

# Remove fully identical rows, then report (rows, columns).
combined_df = combined_df.drop_duplicates()
print(combined_df.shape)

In [None]:
# Keep only the numeric-dtype columns of the two tables used in the
# correlation plots.
connections_numeric_df, processes_numeric_df = (
    frame.select_dtypes(include=[np.number])
    for frame in (connections_df, processes_df)
)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric
# connections columns.
connections_corr = connections_numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=connections_corr, ax=ax, cmap="coolwarm", annot=True, fmt=".2f")
ax.set_title("Correlation Heatmap for Connections Dataset")
# Slant the x labels; keep the y labels horizontal.
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Columns left out of the connections pair plot.
columns_to_exclude = [
    'c.UCMobile.x86',
    'c.UCMobile.intl',
    'c.raider',
    'c.android.vending',
    'imei',
]
filtered_connections_df = connections_numeric_df.drop(columns_to_exclude, axis="columns")

# Pairwise plots of the remaining numeric columns.
sns.pairplot(data=filtered_connections_df)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric
# processes columns (one decimal place in the annotations).
processes_corr = processes_numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=processes_corr, ax=ax, cmap="coolwarm", annot=True, fmt=".1f")
ax.set_title("Correlation Heatmap for Processes Dataset")
# Vertical x labels; horizontal y labels.
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Columns left out of the processes pair plot.
columns_to_exclude = [
    'p.google',
    'p.olauncher',
    'p.android.gms',
    'p.browser.provider',
    'p.process.gapps',
    'p.dogalize',
    'p.android.vending',
    'p.gms.persistent',
    'p.android.defcontainer',
    'p.simulator',
    'p.notifier',
    'p.inputmethod.latin',
    'p.katana',
    'imei',
]
filtered_processes_df = processes_numeric_df.drop(columns_to_exclude, axis="columns")

# Pairwise plots of the remaining numeric columns.
sns.pairplot(data=filtered_processes_df)
plt.show()