In [3]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns

# Replace these values with your database information
dbname = "telecome"
user = "mebmeressa"
password = ""
host = "localhost"
port = 5432  # Default is usually 5432

# Initialize connection and cursor
connection = None
try:
    # Establish a connection
    connection = psycopg2.connect(dbname=dbname, user=user, password=password, host=host, port=port)

    # Replace this query with your own SQL query
    query = 'SELECT * FROM xdr_data'

    # Use Pandas to execute the query and fetch data into a DataFrame
    df = pd.read_sql_query(query, connection)

    # Now 'df' contains your data in a Pandas DataFrame
#     print(df.head())

except psycopg2.Error as e:
    print(f"Error: {e}")

finally:
    # Close the connection in the 'finally' block to ensure it happens
    if connection:
        connection.close()


  df = pd.read_sql_query(query, connection)


# Overview of the Data

In [4]:
# Display basic information about the DataFrame
print(df.info())

# Display basic statistics about the numerical columns
print(df.describe())

# Display the first few rows of the DataFrame
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 56 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   id                                        150001 non-null  int64  
 1   Bearer Id                                 149010 non-null  float64
 2   Start                                     150000 non-null  object 
 3   Start ms                                  150000 non-null  float64
 4   End                                       150000 non-null  object 
 5   End ms                                    150000 non-null  float64
 6   Dur. (ms)                                 150000 non-null  float64
 7   IMSI                                      149431 non-null  float64
 8   MSISDN/Number                             148935 non-null  float64
 9   IMEI                                      149429 non-null  float64
 10  Last Location Name  

# Handle Missing Values:

In [12]:
# Check for missing values
print(df.isnull().sum())

id                                               0
Bearer Id                                      991
Start                                            1
Start ms                                         1
End                                              1
End ms                                           1
Dur. (ms)                                        1
IMSI                                           570
MSISDN/Number                                 1066
IMEI                                           572
Last Location Name                            1153
Avg RTT DL (ms)                              27829
Avg RTT UL (ms)                              27812
Avg Bearer TP DL (kbps)                          1
Avg Bearer TP UL (kbps)                          1
TCP DL Retrans. Vol (Bytes)                  88146
TCP UL Retrans. Vol (Bytes)                  96649
DL TP < 50 Kbps (%)                            754
50 Kbps < DL TP < 250 Kbps (%)                 754
250 Kbps < DL TP < 1 Mbps (%)  

In [None]:
# Drop rows with missing values (if necessary)
#df = df.dropna()

# Fill missing values with a specific value or method (if necessary)
#df = df.fillna(value)


In [7]:
# Assuming 'df' is your DataFrame
# Calculate the percentage of missing values for each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Create a DataFrame to display the results
missing_info = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': df.isnull().sum(),
    'Missing Percentage': missing_percentage
})

# Sort the DataFrame by the percentage of missing values in descending order
missing_info = missing_info.sort_values(by='Missing Percentage', ascending=False)

# Display the results
print(missing_percentage)
# print(missing_info)

id                                           0.000000
Bearer Id                                    0.660662
Start                                        0.000667
Start ms                                     0.000667
End                                          0.000667
End ms                                       0.000667
Dur. (ms)                                    0.000667
IMSI                                         0.379997
MSISDN/Number                                0.710662
IMEI                                         0.381331
Last Location Name                           0.768662
Avg RTT DL (ms)                             18.552543
Avg RTT UL (ms)                             18.541210
Avg Bearer TP DL (kbps)                      0.000667
Avg Bearer TP UL (kbps)                      0.000667
TCP DL Retrans. Vol (Bytes)                 58.763608
TCP UL Retrans. Vol (Bytes)                 64.432237
DL TP < 50 Kbps (%)                          0.502663
50 Kbps < DL TP < 250 Kbps (

# Data Visualization:

In [None]:
# Pairplot for visualizing relationships between numerical variables
sns.pairplot(df)
plt.show()

# Correlation heatmap
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

# Univariate Analysis

In [None]:
# Histogram for a numerical variable
df['column_name'].hist()
plt.show()

# Countplot for a categorical variable
sns.countplot(x='column_name', data=df)
plt.show()


# Bivariate Analysis:

In [None]:
# Scatter plot for two numerical variables
plt.scatter(df['column1'], df['column2'])
plt.xlabel('Column 1')
plt.ylabel('Column 2')
plt.show()

# Boxplot for a numerical variable by a categorical variable
sns.boxplot(x='category_column', y='numeric_column', data=df)
plt.show()


# Outlier Detection

In [None]:
# Boxplot for detecting outliers
sns.boxplot(x='column_name', data=df)
plt.show()


# Feature Engineering:

In [None]:
# Create new features or modify existing ones
df['new_feature'] = df['feature1'] + df['feature2']


# Data Transformation:

In [None]:
# Standardize or normalize numerical columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['numeric_column1', 'numeric_column2']] = scaler.fit_transform(df[['numeric_column1', 'numeric_column2']])


# Explore Categorical Variables:

In [None]:
# Frequency distribution of a categorical variable
df['categorical_column'].value_counts()

# Barplot for a categorical variable
sns.barplot(x='category_column', y='numeric_column', data=df)
plt.show()
