In [150]:
import re
import os
import sys
import time
import logging
import subprocess
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from datetime import datetime

# Global DataFrame to store log entries
log_df = pd.DataFrame(columns=['Timestamp', 'Level', 'Message'])

class DataFrameLoggingHandler(logging.Handler):
    def __init__(self, dataframe):
        super().__init__()
        self.dataframe = dataframe

    def emit(self, record):
        timestamp = pd.Timestamp.now()
        level = record.levelname
        # Use the actual log message from the record
        message = record.getMessage()  
        self.dataframe.loc[len(self.dataframe)] = [timestamp, level, message]

def setup_logging(log_file='app_log.log', log_level=logging.INFO):
    """
    Set up logging configuration to log to a file and console, and return a logger.
    
    Parameters:
    log_file (str): The name of the log file where logs will be saved.
    log_level (int): The log level (INFO, DEBUG, etc.).
    
    Returns:
    logger (logging.Logger): Configured logger.
    """
    # Remove any existing handlers attached to the root logger
    logging.root.handlers.clear()

    logger = logging.getLogger()
    logger.setLevel(log_level)

    # Define handlers
    handlers = [
        logging.FileHandler(log_file),
        logging.StreamHandler(),
        DataFrameLoggingHandler(log_df)
    ]
    
    for handler in handlers:
        handler.setLevel(log_level)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)

    return logger

def log_imported_libraries(libraries, logger):
    """
    Log the imported libraries with their aliases.
    
    Parameters:
    libraries (dict): Dictionary of library names and their aliases.
    logger (logging.Logger): Logger to log the messages.
    """
    for lib, alias in libraries.items():
        logger.info(f"Importing '{lib}' as '{alias}'" if alias else f"Importing '{lib}' without alias")

# Set up the logger
logger = setup_logging(log_file='app_log.log', log_level=logging.INFO)

# Libraries and their aliases (None for libraries without an alias)
libraries_with_aliases = {
    'os': None,
    'time': None,
    're': 're',
    'pandas': 'pd',
    'numpy': 'np',
    'matplotlib.pyplot': 'plt',
    'seaborn': 'sns',
    'plotly.express': 'px',
    'plotly.graph_objects': 'go',
    'plotly.io': 'pio'
}

# Log the library imports
logger.info("Initializing logging configuration")
log_imported_libraries(libraries_with_aliases, logger)

2024-09-23 21:44:09,395 - INFO - Initializing logging configuration
2024-09-23 21:44:09,403 - INFO - Importing 'os' without alias
2024-09-23 21:44:09,415 - INFO - Importing 'time' without alias
2024-09-23 21:44:09,423 - INFO - Importing 're' as 're'
2024-09-23 21:44:09,435 - INFO - Importing 'pandas' as 'pd'
2024-09-23 21:44:09,446 - INFO - Importing 'numpy' as 'np'
2024-09-23 21:44:09,456 - INFO - Importing 'matplotlib.pyplot' as 'plt'
2024-09-23 21:44:09,465 - INFO - Importing 'seaborn' as 'sns'
2024-09-23 21:44:09,477 - INFO - Importing 'plotly.express' as 'px'
2024-09-23 21:44:09,496 - INFO - Importing 'plotly.graph_objects' as 'go'
2024-09-23 21:44:09,507 - INFO - Importing 'plotly.io' as 'pio'


In [151]:
def scan_directories(root_dir):
    logging.info(f'Starting directory scan of {root_dir}')
    data = []
    total_files = 0  # To keep track of total files processed

    # Walk through the directory
    for dirpath, dirnames, filenames in os.walk(root_dir):
        #logging.info(f'Scanning directory: {dirpath}')
        for filename in filenames:
            try:
                file_path = os.path.join(dirpath, filename)
                file_stats = os.stat(file_path)
                
                # Collect file metadata
                file_size = file_stats.st_size
                metadata_change_time = datetime.fromtimestamp(file_stats.st_ctime)  # Metadata change time
                modification_time = datetime.fromtimestamp(file_stats.st_mtime)  # Last modified time
                access_time = datetime.fromtimestamp(file_stats.st_atime)  # Last access time

                # Append metadata to the list
                data.append([dirpath, filename, file_path, file_size, metadata_change_time, modification_time, access_time])
                total_files += 1

            except Exception as e:
                logging.error(f'Error processing file {filename} in directory {dirpath}: {e}')

    # Convert to a pandas DataFrame with appropriate columns
    df = pd.DataFrame(data, columns=[
        'directory', 'filename', 'full_path', 
        'file_size', 'metadata_change_time', 'modification_time', 'access_time'])

    logging.info(f'Scanning completed. Total files processed: {total_files}')
    
    return df

# Set the root directory to your project path
root_dir = '/home/tron/git/light_site'

# Get the DataFrame with metadata
meta_df = scan_directories(root_dir)

# Preview the DataFrame
meta_df.head(10)

2024-09-23 21:44:09,629 - INFO - Starting directory scan of /home/tron/git/light_site
2024-09-23 21:44:09,801 - INFO - Scanning completed. Total files processed: 1891


Unnamed: 0,directory,filename,full_path,file_size,metadata_change_time,modification_time,access_time
0,/home/tron/git/light_site,.env,/home/tron/git/light_site/.env,239,2024-09-21 00:21:41.665248,2024-09-21 00:21:41.665248,2024-09-23 14:32:37.800922
1,/home/tron/git/light_site,Pipfile.lock,/home/tron/git/light_site/Pipfile.lock,198611,2024-09-23 18:06:37.124429,2024-09-23 18:06:37.124429,2024-09-23 18:06:37.125429
2,/home/tron/git/light_site,Pipfile,/home/tron/git/light_site/Pipfile,790,2024-09-23 18:06:56.138958,2024-09-23 18:06:56.138958,2024-09-23 18:04:08.194140
3,/home/tron/git/light_site,structure_processed.pkl,/home/tron/git/light_site/structure_processed.pkl,2393012,2024-09-21 14:57:52.673820,2024-09-21 14:57:52.673820,2024-09-23 13:48:30.145471
4,/home/tron/git/light_site,package-lock.json,/home/tron/git/light_site/package-lock.json,88388,2024-09-15 11:09:13.627961,2024-09-11 12:19:51.437347,2024-09-16 09:38:05.473607
5,/home/tron/git/light_site,package.json,/home/tron/git/light_site/package.json,451,2024-09-15 11:09:13.629961,2024-09-11 11:08:32.634561,2024-09-22 10:26:32.473087
6,/home/tron/git/light_site,app_log.log,/home/tron/git/light_site/app_log.log,3518119,2024-09-21 18:07:07.504571,2024-09-21 18:07:07.504571,2024-09-19 01:30:39.510984
7,/home/tron/git/light_site,.env.swp,/home/tron/git/light_site/.env.swp,12288,2024-09-15 11:09:12.454967,2024-08-31 03:40:05.350710,2024-09-15 13:17:57.751385
8,/home/tron/git/light_site,manage.py,/home/tron/git/light_site/manage.py,666,2024-09-15 11:09:12.454967,2024-08-31 22:39:45.525605,2024-09-21 00:09:30.265023
9,/home/tron/git/light_site,requirements.txt,/home/tron/git/light_site/requirements.txt,672,2024-09-15 11:09:12.465967,2024-08-31 22:48:10.552679,2024-09-22 10:26:32.472087


In [152]:
def feature_engineer(meta_df):
    logging.info('Starting feature engineering on the dataset')

    # Feature 1: File age in days
    try:
        logging.info('Applying Feature 1: File age in days based on modification time')
        meta_df['file_age_in_days'] = (datetime.now() - meta_df['modification_time']).dt.days
    except Exception as e:
        logging.error(f'Error while calculating file age in days: {e}')
    
    # Feature 2: Metadata change age in days
    try:
        logging.info('Applying Feature 2: Metadata change age in days')
        meta_df['meta_data_age_in_days'] = (datetime.now() - meta_df['metadata_change_time']).dt.days
    except Exception as e:
        logging.error(f'Error while calculating metadata change age in days: {e}')

    # Feature 3: Recently accessed (less than 3 days)
    try:
        logging.info('Applying Feature 3: Recently accessed files (less than 3 days)')
        meta_df['recently_accessed'] = (datetime.now() - meta_df['access_time']).dt.days < 3
    except Exception as e:
        logging.error(f'Error while determining recently accessed files: {e}')

    # Function to categorize file size
    def categorize_file_size(size):
        try:
            if size < 10**4:  # less than 10KB
                return 'Small'
            elif size < 10**6:  # less than 1MB
                return 'Medium'
            else:
                return 'Large'
        except Exception as e:
            logging.error(f'Error while categorizing file size: {e}')
            return 'Unknown'

    # Feature 4: File size category
    try:
        logging.info('Applying Feature 4: Categorizing file sizes')
        meta_df['file_size_category'] = meta_df['file_size'].apply(categorize_file_size)
    except Exception as e:
        logging.error(f'Error while applying file size categorization: {e}')
    
    # Feature 5: File extension
    try:
        logging.info('Applying Feature 5: Extracting file extensions')
        def extract_file_extension(filename):
            # Extract file extension (without dot) or return 'no_extension' if none is found
            file_extension = os.path.splitext(filename)[1][1:]  # Get the extension without the dot
            if file_extension == '':  # If no extension is found
                return 'no_extension'
            return file_extension
        meta_df['file_extension'] = meta_df['filename'].apply(extract_file_extension)
    except Exception as e:
        logging.error(f'Error while extracting file extensions: {e}')

    # Feature 6: Is Python file
    try:
        logging.info('Applying Feature 6: Identifying Python files')
        meta_df['python_file'] = meta_df['file_extension'] == 'py'
    except Exception as e:
        logging.error(f'Error while identifying Python files: {e}')

    # Feature 7: File depth level by counting slashes in 'full_path'
    try:
        logging.info("Applying Feature 7: Calculating file depth level based on 'full_path'")
        
        # Function to calculate the depth level by counting slashes
        def calculate_depth(full_path):
            return full_path.count('/')  # Count the number of slashes
        
        # Apply the function to the 'full_path' column to create a new 'file_depth' column
        meta_df['file_depth'] = meta_df['full_path'].apply(calculate_depth).astype(int)
    
    except Exception as e:
        logging.error(f"Error while calculating file depth level: {e}")

    logging.info('Feature engineering completed successfully')
    
    return meta_df


# Apply the feature engineering function to meta_df
meta_df = feature_engineer(meta_df)

2024-09-23 21:44:09,905 - INFO - Starting feature engineering on the dataset
2024-09-23 21:44:09,918 - INFO - Applying Feature 1: File age in days based on modification time
2024-09-23 21:44:09,932 - INFO - Applying Feature 2: Metadata change age in days
2024-09-23 21:44:09,944 - INFO - Applying Feature 3: Recently accessed files (less than 3 days)
2024-09-23 21:44:09,957 - INFO - Applying Feature 4: Categorizing file sizes
2024-09-23 21:44:09,972 - INFO - Applying Feature 5: Extracting file extensions
2024-09-23 21:44:09,993 - INFO - Applying Feature 6: Identifying Python files
2024-09-23 21:44:10,004 - INFO - Applying Feature 7: Calculating file depth level based on 'full_path'
2024-09-23 21:44:10,019 - INFO - Feature engineering completed successfully


In [153]:
def classify_django_element(file_path, filename):
    # Combine the directory and filename for classification
    full_path = os.path.join(file_path, filename)
    
    # Project-level configuration files
    if 'manage.py' in full_path:
        return 'Django Management Script'
    elif 'settings.py' in full_path or 'urls.py' in full_path or 'wsgi.py' in full_path or 'asgi.py' in full_path:
        return 'Django Project Configuration'

    # App-level files
    elif 'models.py' in full_path:
        return 'Django App Models'
    elif 'views.py' in full_path:
        return 'Django App Views'
    elif 'admin.py' in full_path:
        return 'Django App Admin'
    elif 'apps.py' in full_path:
        return 'Django App Configuration'
    elif 'tests.py' in full_path:
        return 'Django App Tests'
    
    # Migrations
    elif 'migrations/' in full_path and full_path.endswith('.py'):
        return 'Django Migrations'

    # Static files
    elif 'static/' in full_path:
        if filename.endswith('.css'):
            return 'Static CSS File'
        elif filename.endswith('.js'):
            return 'Static JS File'
        elif filename.endswith(('.jpg', '.png', '.gif', '.svg')):
            return 'Static Image File'
        else:
            return 'Other Static File'

    # Templates
    elif 'templates/' in full_path and filename.endswith('.html'):
        return 'Django Template'

    # Log files
    elif filename.endswith('.log'):
        return 'Log File'

    # Environment or dependency files
    elif filename in ['.env', 'Pipfile', 'Pipfile.lock', 'requirements.txt']:
        return 'Environment/Dependency File'
    
    # Other Python files
    elif filename.endswith('.py'):
        return 'Python Script'

    # Default fallback
    return 'Other'

# Apply the classification function to each row
meta_df['django_element_type'] = meta_df.apply(
    lambda row: classify_django_element(row['directory'], row['filename']), axis=1
)

In [154]:
meta_df.head(10)

Unnamed: 0,directory,filename,full_path,file_size,metadata_change_time,modification_time,access_time,file_age_in_days,meta_data_age_in_days,recently_accessed,file_size_category,file_extension,python_file,file_depth,django_element_type
0,/home/tron/git/light_site,.env,/home/tron/git/light_site/.env,239,2024-09-21 00:21:41.665248,2024-09-21 00:21:41.665248,2024-09-23 14:32:37.800922,2,2,True,Small,no_extension,False,5,Environment/Dependency File
1,/home/tron/git/light_site,Pipfile.lock,/home/tron/git/light_site/Pipfile.lock,198611,2024-09-23 18:06:37.124429,2024-09-23 18:06:37.124429,2024-09-23 18:06:37.125429,0,0,True,Medium,lock,False,5,Environment/Dependency File
2,/home/tron/git/light_site,Pipfile,/home/tron/git/light_site/Pipfile,790,2024-09-23 18:06:56.138958,2024-09-23 18:06:56.138958,2024-09-23 18:04:08.194140,0,0,True,Small,no_extension,False,5,Environment/Dependency File
3,/home/tron/git/light_site,structure_processed.pkl,/home/tron/git/light_site/structure_processed.pkl,2393012,2024-09-21 14:57:52.673820,2024-09-21 14:57:52.673820,2024-09-23 13:48:30.145471,2,2,True,Large,pkl,False,5,Other
4,/home/tron/git/light_site,package-lock.json,/home/tron/git/light_site/package-lock.json,88388,2024-09-15 11:09:13.627961,2024-09-11 12:19:51.437347,2024-09-16 09:38:05.473607,12,8,False,Medium,json,False,5,Other
5,/home/tron/git/light_site,package.json,/home/tron/git/light_site/package.json,451,2024-09-15 11:09:13.629961,2024-09-11 11:08:32.634561,2024-09-22 10:26:32.473087,12,8,True,Small,json,False,5,Other
6,/home/tron/git/light_site,app_log.log,/home/tron/git/light_site/app_log.log,3518119,2024-09-21 18:07:07.504571,2024-09-21 18:07:07.504571,2024-09-19 01:30:39.510984,2,2,False,Large,log,False,5,Log File
7,/home/tron/git/light_site,.env.swp,/home/tron/git/light_site/.env.swp,12288,2024-09-15 11:09:12.454967,2024-08-31 03:40:05.350710,2024-09-15 13:17:57.751385,23,8,False,Medium,swp,False,5,Other
8,/home/tron/git/light_site,manage.py,/home/tron/git/light_site/manage.py,666,2024-09-15 11:09:12.454967,2024-08-31 22:39:45.525605,2024-09-21 00:09:30.265023,22,8,True,Small,py,True,5,Django Management Script
9,/home/tron/git/light_site,requirements.txt,/home/tron/git/light_site/requirements.txt,672,2024-09-15 11:09:12.465967,2024-08-31 22:48:10.552679,2024-09-22 10:26:32.472087,22,8,True,Small,txt,False,5,Environment/Dependency File


In [159]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_django_project_analysis_plot(meta_df):
    # Create a 3x1 and 1x1 subplot grid
    fig = make_subplots(
        rows=2, cols=3, 
        specs=[[{"colspan": 1}, {}, {}], [{"colspan": 3}, None, None]],  # Top row has 3 plots, bottom row has 1 plot stretched
        subplot_titles=("File Size Distribution", "Metadata Update", 
                        "Django Element Type", "Metadata Age"),
        horizontal_spacing=0.05,
        vertical_spacing=0.05
    )

    # Graph A: Scatter Chart
    fig_a = px.scatter(meta_df, x='file_extension', y='file_depth', color='file_size_category', size='file_depth',
                   labels={'file_extension': 'File Extension', 'file_depth': 'File Depth'}, template='plotly_dark')
    fig_a.update_traces(showlegend=False)
    for trace in fig_a['data']:
        fig.add_trace(trace, row=1, col=1)

    # Graph B: Box Plot
    fig_b = px.box(meta_df, x='file_extension', y='file_depth', color='meta_data_age_in_days',
                   labels={'file_extension': 'File Extension', 'file_depth': 'File Depth'}, template='plotly_dark')
    fig_b.update_traces(showlegend=False)
    for trace in fig_b['data']:
        fig.add_trace(trace, row=1, col=2)

    # Graph C: Violin Plot
    fig_c = px.violin(meta_df, x='file_extension', y='file_depth', color='django_element_type',
                   labels={'file_extension': 'File Extension', 'file_depth': 'File Depth'}, template='plotly_dark')
    fig_c.update_traces(showlegend=False)
    for trace in fig_c['data']:
        fig.add_trace(trace, row=1, col=3)

    # Graph D: Scatter Plot (spans the entire width at the base)
    fig_d = px.timeline(meta_df, x_start='modification_time', x_end='metadata_change_time', y='file_extension', color='file_size_category',
                       labels={'metadata_change_time': 'Metadata Change', 'access_time': 'Last Accessed'}, template='plotly_dark')
    fig_d.update_traces(showlegend=False)
    for trace in fig_d['data']:
        fig.add_trace(trace, row=2, col=1)

    # Update layout for the entire figure with black background
    fig.update_layout(
        title="Django Project Analysis",
        template='plotly_dark',
        width=1200, height=800,
        plot_bgcolor='black',  # Plot background color
        paper_bgcolor='black',  # Entire figure background color
        font_color='white'  # Font color for text elements
    )

    return fig

In [160]:
fig.show()