In [5]:
import pandas as pd
import numpy as np

def load_dataset(file_path):
    """Read a CSV file into a DataFrame, reporting the outcome on stdout.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised any exception
        (the error message is printed rather than propagated).
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error loading dataset: {err}")
        return None
    else:
        print("Dataset loaded successfully!")
        return frame

def calculate_volume(data):
    """Report the size of the dataset (Volume).

    Args:
        data: DataFrame whose ``shape`` is reported.

    Returns:
        Tuple ``(rows, columns)`` taken from ``data.shape``.
    """
    n_rows, n_cols = data.shape
    report = (
        "\n--- Volume ---",
        f"Number of Rows: {n_rows}",
        f"Number of Columns: {n_cols}",
    )
    # One print with a newline separator emits the same three lines.
    print(*report, sep="\n")
    return n_rows, n_cols

def analyse_variety(data):
    """Report each column's dtype and distinct-value count (Variety).

    Args:
        data: DataFrame to describe.

    Returns:
        Tuple ``(data_types, unique_counts)``: the ``data.dtypes`` Series
        and the ``data.nunique()`` Series, both indexed by column label.
    """
    print("\n--- Variety ---")
    dtype_by_column = data.dtypes
    distinct_by_column = data.nunique()
    for name in data.columns:
        dtype = dtype_by_column[name]
        distinct = distinct_by_column[name]
        print(f"Column: {name}, Data Type: {dtype}, Unique Values: {distinct}")
    return dtype_by_column, distinct_by_column

def evaluate_velocity(data):
    """Report the mean gap between consecutive records (Velocity).

    The ``timestamp`` column, if present, is parsed with
    ``pd.to_datetime(..., errors='coerce')``. Values that cannot be parsed
    are discarded before the gaps are computed, so a single bad value does
    not also wipe out the two intervals that border it. Records are taken
    in their existing row order; they are not sorted.

    The input DataFrame is not modified.

    Args:
        data: DataFrame that may contain a ``timestamp`` column.

    Returns:
        The mean difference between consecutive valid timestamps (``NaT``
        when fewer than two timestamps parse), or ``None`` when there is no
        ``timestamp`` column.
    """
    print("\n--- Velocity ---")
    if 'timestamp' not in data.columns:
        print("No timestamp data available to evaluate velocity.")
        return None

    # Parse into a separate Series instead of assigning back into ``data``:
    # overwriting the caller's column is a hidden side effect that changes
    # what every later analysis step sees.
    timestamps = pd.to_datetime(data['timestamp'], errors='coerce')

    # Drop unparseable entries *before* diffing; diffing first would turn
    # each NaT into two missing intervals and bias the average.
    time_differences = timestamps.dropna().diff().dropna()
    avg_velocity = time_differences.mean()
    print(f"Average Time Difference Between Records: {avg_velocity}")
    return avg_velocity

def assess_veracity(data):
    """Report data-quality indicators (Veracity).

    Prints and returns two per-column counts: missing values, and text
    cells that contain a backslash, ``?`` or ``*``.

    Args:
        data: DataFrame to inspect.

    Returns:
        Tuple ``(missing_values, inconsistencies)``. ``missing_values`` is
        ``data.isnull().sum()``; ``inconsistencies`` holds, for each text
        column, the number of cells matching the special-character pattern,
        and 0 for every non-text column.
    """
    print("\n--- Veracity ---")
    missing_values = data.isnull().sum()
    print("Missing Values per Column:")
    print(missing_values)

    def _count_special_characters(column):
        # Text can be stored as object dtype or as pandas' dedicated string
        # dtype; comparing the dtype to 'object' alone silently skips the
        # latter and reports 0 for columns that were never checked.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            return 0
        # na=False makes missing cells count as "no match".
        return column.str.contains(r'[\\?*]', na=False).sum()

    inconsistencies = data.apply(_count_special_characters)
    print("Potential Inconsistencies (e.g., special characters):")
    print(inconsistencies)
    return missing_values, inconsistencies

def determine_value(data):
    """Print and return summary statistics for every column (Value).

    Args:
        data: DataFrame to summarise.

    Returns:
        The result of ``data.describe(include='all')``.
    """
    print("\n--- Value ---", "Descriptive Statistics:", sep="\n")
    summary = data.describe(include='all')
    print(summary)
    return summary

def analyze_dataset(file_path):
    """Load a CSV file and run the five "V" reports on it, in order.

    The reports are Volume, Variety, Velocity, Veracity and Value. Their
    return values are discarded; this function only produces printed
    output and returns ``None``.

    Args:
        file_path: Location of the CSV file to analyse.
    """
    data = load_dataset(file_path)
    if data is None:
        # Nothing to analyse; load_dataset already printed the error.
        return

    print("\n--- 5 V's Analysis ---")
    reports = (
        calculate_volume,
        analyse_variety,
        evaluate_velocity,
        assess_veracity,
        determine_value,
    )
    for report in reports:
        report(data)

# Driver for this cell: run the full analysis on the CSV named below.
# The path is relative, so it is resolved against the current working
# directory of the running interpreter.
file_path = "student-por.csv"
analyze_dataset(file_path)






Dataset loaded successfully!

--- 5 V's Analysis ---

--- Volume ---
Number of Rows: 649
Number of Columns: 33

--- Variety ---
Column: school, Data Type: object, Unique Values: 2
Column: sex, Data Type: object, Unique Values: 2
Column: age, Data Type: int64, Unique Values: 8
Column: address, Data Type: object, Unique Values: 2
Column: famsize, Data Type: object, Unique Values: 2
Column: Pstatus, Data Type: object, Unique Values: 2
Column: Medu, Data Type: int64, Unique Values: 5
Column: Fedu, Data Type: int64, Unique Values: 5
Column: Mjob, Data Type: object, Unique Values: 5
Column: Fjob, Data Type: object, Unique Values: 5
Column: reason, Data Type: object, Unique Values: 4
Column: guardian, Data Type: object, Unique Values: 3
Column: traveltime, Data Type: int64, Unique Values: 4
Column: studytime, Data Type: int64, Unique Values: 4
Column: failures, Data Type: int64, Unique Values: 4
Column: schoolsup, Data Type: object, Unique Values: 2
Column: famsup, Data Type: object, Unique V