In [3]:
import pandas as pd

df = pd.read_csv('/Users/QubryxVivek/QUB/python/data.csv')
df

Unnamed: 0,MODEL,COMPANY,CC,TORQUE,MILEAGE,PRICE,TYPE
0,Himalayan 450,Royal Enfield,411,32.0 Nm,30,3.06 - 3.20,Adventure
1,390 Adventure,KTM,373,37.0,25,3.68,adventure
2,G 310 GS,BMW,313,28.0Nm,30,3.3,Adventure
3,v-strom sx,Suzuki,399,37.0,29,2.31,Adventure
4,Adventure,Yezdi,334,27.0,35,2.3,Adventure
...,...,...,...,...,...,...,...
95,Apace RTR 310,TVS,312,27.3,30,2.21,Sports Naked
96,G 310 GS,BMW,Not Available,28.0,30,,Adventure
97,Unicorn,Honda,162 CC,?,60,1.12,Commuter
98,Jawa 42 Bobber,Jawa,,,30,1.95,Bobber


In [4]:
import pandas as pd
import numpy as np

df_v = pd.read_csv('/Users/QubryxVivek/QUB/python/top_100_bikes.csv')

def validity(df):
    total_rows = len(df)
    
    def is_str(val):
        return isinstance(val, str)
    
    def in_range(val, low, high):
        try:
            v = float(val)
            return low<=v<=high
        except:
            return False
    
    valid_types = {
        'Street/Naked',
        'Sports',
        'Adventure',
        'Cruiser',
        'Commuter',
        'Scambler',
        'Electric Sports'
    }

    vresults = {}
    
    for col in ['MODEL', 'COMPANY', 'TYPE']:
        if col in df.columns:
            valid_count = df[col].apply(is_str).sum()
            vresults[col] = (valid_count / total_rows) * 100
        else:
            vresults[col] = None

    if 'TYPE' in df.columns:
        valid_type_count = df['TYPE'].apply(lambda x: x.strip() in valid_types).sum()
        vresults['TYPE'] = (valid_type_count / total_rows) * 100
    else:
        vresults['TYPE'] = None

    ranges = {
        'CC': (150, 648),
        'PRICE': (1.50, float('inf')),
        'MILEAGE': (20, float('inf')),
        'TORQUE': (19, 52),
    }
    
    for col, (low, high) in ranges.items():
        if col in df.columns:
            vcount = df[col].apply(lambda x: in_range(x, low, high)).sum()
            vresults[col] = (vcount / total_rows) * 100
        else:
            vresults[col] = None
    
    return pd.Series(vresults, name='Validity %')

def accuracy(df, df_v,col_to_check):
    total_rows = len(df)
    aresults = {}

    for col in col_to_check:
        if col in  df.columns and col in df_v.columns:
            acount = (df[col] == df_v[col]).sum()
            aresults[col] = (acount / total_rows) * 100
        else:
            aresults[col] = None
    return pd.Series(aresults, name='accuracy %')

def data_quality(df, df_v):
    total_count = len(df)
    data = pd.DataFrame(index=df.columns)

    data['data_type'] = df.dtypes
    
    data['missing value count'] = df.isnull().sum()
    
    data['completeness %'] = ((total_count - data['missing value count']) / total_count) * 100
    
    data['unique count'] = df.nunique()
    
    data['uniqueness %'] = (data['unique count'] / total_count) * 100
    
    data['validity %'] = validity(df)

    if df_v is not None:
        col_to_check = ['MODEL','COMPANY','CC','TORQUE','PRICE','TYPE']
        data['accuracy %'] = accuracy(df,df_v,col_to_check)
    else:
        data['accuracy %'] = np.nan
    
    data['Quality Score %'] = data[['completeness %', 'validity %', 'accuracy %']].mean(axis=1)
    
    return data

quality_metrics = data_quality(df, df_v)
print(quality_metrics)


        data_type  missing value count  completeness %  unique count  \
MODEL      object                    0           100.0            73   
COMPANY    object                    0           100.0            26   
CC         object                   12            88.0            35   
TORQUE     object                   13            87.0            35   
MILEAGE    object                   12            88.0            26   
PRICE      object                    7            93.0            51   
TYPE       object                    0           100.0            33   

         uniqueness %  validity %  accuracy %  Quality Score %  
MODEL            73.0       100.0       100.0       100.000000  
COMPANY          26.0       100.0       100.0       100.000000  
CC               35.0        70.0         0.0        52.666667  
TORQUE           35.0        50.0         0.0        45.666667  
MILEAGE          26.0        71.0         NaN        79.500000  
PRICE            51.0        51.0

**TABLE LEVEL QUALITY METRICS**

In [None]:
import pandas as pd
import numpy as np

def table_data_quality(df, df_v):
    col_data = data_quality(df, df_v)

    total_rows = len(df)

    total_columns = len(df.columns)

    total_cells = total_rows* total_columns

    table_metrics = {}

    total_missing = col_data['missing value count'].sum()
    table_metrics['Overall Missing Values %'] = ( total_missing / total_cells) * 100  

    total_non_missing = total_cells - total_missing
    table_metrics['Overall Completeness %'] = (total_non_missing / total_cells) * 100

    total_unique = col_data['unique count'].sum()

    table_metrics['Overall Uniqueness %'] = (total_unique / total_cells) * 100

    validity_values = col_data['validity %'].dropna()
    table_metrics['Overall Validity %'] = validity_values.mean() 

    accuracy_values = col_data['accuracy %'].dropna()
    table_metrics['Overall Accuracy %'] = accuracy_values.mean() 

    valid_quality_scores = col_data['Quality Score %'].dropna()
    table_metrics['Overall Quality Score %'] = valid_quality_scores.mean()

    return pd.Series(table_metrics)
table_quality = table_data_quality(df, df_v)
print(table_quality)




Overall Missing Values %     6.285714
Overall Completeness %      93.714286
Overall Uniqueness %        39.857143
Overall Validity %          71.285714
Overall Accuracy %          50.000000
Overall Quality Score %     73.071429
dtype: float64
