In [1]:
import pandas as pd

# Load the dataset from a local CSV file and preview it as the cell output.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook can run on other machines.
DATA_CSV_PATH = 'C:\\Users\\vivek\\qubryx\\python\\data.csv'
df = pd.read_csv(DATA_CSV_PATH)
df

Unnamed: 0,MODEL,COMPANY,CC,TORQUE,MILEAGE,PRICE,TYPE
0,Himalayan 450,Royal Enfield,411,32.0 Nm,30,3.06 - 3.20,Adventure
1,390 Adventure,KTM,373,37.0,25,3.68,adventure
2,G 310 GS,BMW,313,28.0Nm,30,3.3,Adventure
3,v-strom sx,Suzuki,399,37.0,29,2.31,Adventure
4,Adventure,Yezdi,334,27.0,35,2.3,Adventure
...,...,...,...,...,...,...,...
95,Apace RTR 310,TVS,312,27.3,30,2.21,Sports Naked
96,G 310 GS,BMW,Not Available,28.0,30,,Adventure
97,Unicorn,Honda,162 CC,?,60,1.12,Commuter
98,Jawa 42 Bobber,Jawa,,,30,1.95,Bobber


In [5]:
import pandas as pd
import numpy as np

def validity(df, text_columns=None, thresholds=None):
    """Compute, per column, the percentage of rows holding a valid value.

    Two kinds of checks are applied:

    * text columns: a value is valid when it is a ``str`` instance;
    * numeric columns: a value is valid when ``float(value)`` succeeds and
      the result is greater than or equal to the column's minimum threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    text_columns : iterable of str, optional
        Columns checked with the string rule. Defaults to
        ``('MODEL', 'COMPANY', 'TYPE')``.
    thresholds : mapping of str to number, optional
        Minimum accepted value per numeric column. Defaults to
        ``{'CC': 200, 'PRICE': 1.50, 'MILEAGE': 20, 'TORQUE': 26}``.

    Returns
    -------
    pandas.Series
        Named ``'Validity %'`` and indexed by the checked column names (text
        columns first, then numeric ones). A column missing from ``df`` is
        recorded as ``None`` (pandas may store it as NaN). When ``df`` has no
        rows, present columns are recorded as NaN because a percentage of
        zero rows is undefined.
    """
    if text_columns is None:
        text_columns = ('MODEL', 'COMPANY', 'TYPE')
    if thresholds is None:
        thresholds = {'CC': 200, 'PRICE': 1.50, 'MILEAGE': 20, 'TORQUE': 26}

    total_rows = len(df)

    def is_str(val):
        return isinstance(val, str)

    def meets_threshold(val, threshold):
        # Only conversion/comparison failures mean "invalid"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        try:
            return float(val) >= threshold
        except (TypeError, ValueError, OverflowError):
            return False

    def percent_valid(col, predicate):
        # Shared by both check kinds so they cannot drift apart.
        if col not in df.columns:
            return None
        if total_rows == 0:
            # Avoid dividing by zero on an empty frame.
            return float('nan')
        valid_count = df[col].apply(predicate).sum()
        return (valid_count / total_rows) * 100

    validity_results = {col: percent_valid(col, is_str) for col in text_columns}

    for col, threshold in thresholds.items():
        # Bind the threshold as a default so each predicate keeps its own value.
        validity_results[col] = percent_valid(
            col, lambda value, minimum=threshold: meets_threshold(value, minimum)
        )

    return pd.Series(validity_results, name='Validity %')

def data_quality(df):
    """Summarise per-column data-quality metrics for ``df``.

    The returned frame is indexed by the columns of ``df`` and holds, in
    order: ``'missing value count'`` (null cells), ``'completeness %'``
    (share of non-null cells), ``'unique count'`` (``DataFrame.nunique``),
    ``'uniqueness %'`` (unique count over row count), ``'validity %'`` (from
    ``validity(df)``) and ``'Quality Score %'`` (row-wise mean of the
    completeness, validity and uniqueness percentages).
    """
    n_rows = len(df)
    report = pd.DataFrame(index=df.columns)

    # Nulls per column, and the share of rows that are populated.
    report['missing value count'] = df.isnull().sum()
    report['completeness %'] = (
        report['missing value count'].rsub(n_rows).div(n_rows).mul(100)
    )

    report['unique count'] = df.nunique()
    report['uniqueness %'] = report['unique count'].div(n_rows).mul(100)

    # Columns that validity() does not report on become NaN after index alignment.
    report['validity %'] = validity(df)

    score_inputs = ['completeness %', 'validity %', 'uniqueness %']
    report['Quality Score %'] = report[score_inputs].mean(axis=1)

    return report

# Build the per-column quality report for the loaded frame and print it.
# NOTE(review): this relies on `df` from an earlier cell. The printed report
# has no 'Unnamed: 0' row and its percentages imply 100 rows, while the
# In [1] preview shows 'Unnamed: 0' and ends at row label 98 — `df` was
# presumably modified in cells not shown here; confirm with Restart & Run All.
quality_metrics = data_quality(df)
print(quality_metrics)


         missing value count  completeness %  unique count  uniqueness %  \
MODEL                      0           100.0            73          73.0   
COMPANY                    0           100.0            26          26.0   
CC                        12            88.0            35          35.0   
TORQUE                    13            87.0            35          35.0   
MILEAGE                   12            88.0            26          26.0   
PRICE                      7            93.0            51          51.0   
TYPE                       0           100.0            33          33.0   

         validity %  Quality Score %  
MODEL         100.0        91.000000  
COMPANY       100.0        75.333333  
CC             55.0        59.333333  
TORQUE         43.0        55.000000  
MILEAGE        71.0        61.666667  
PRICE          51.0        65.000000  
TYPE          100.0        77.666667  
