In [None]:
import numpy as np

# Absolute local path to the raw CSV; point this at your own copy of the
# file before running the notebook on another machine.
RAW_CSV_PATH = r"D:\Git_repo\Data-analysis-py\Numpy\students_messy_120k.csv"

# Every field is read as text (dtype=str) so that nothing is coerced before
# the cleaning cells run; the first line of the file is skipped as a header.
data = np.genfromtxt(
    RAW_CSV_PATH,
    dtype=str,
    delimiter=",",
    skip_header=1,
    encoding="utf-8",
)

print(data.shape)


(121440, 13)


In [3]:
# Show the loaded array; NumPy abbreviates the repr of a large array with "...".
data

array([['115082', 'Fatima Yusuf', '', ..., '2021-05-05 06:00:00',
        'fatima.yusuf1654@school.edu', '07050937730'],
       ['86795', 'Ada Ogundare', 'Male', ..., '22/08/2021 03:00 AM',
        'ada.ogundare1907@school.edu', '08484889590'],
       ['49227', 'Efe Olatunji', 'Female', ..., '2021-07-08 03:00:00',
        'efe.olatunji2306@school.edu', '09828475957'],
       ...,
       ['103695', 'Ngozi Bello', 'Female', ..., '2020-10-29 03:00:00',
        'ngozi.bello5001@school.edu', '07879678058'],
       ['861', 'Amina Olatunji', 'M', ..., '16/07/2019 11:00 PM',
        'amina.olatunji5161@school.edu', '08802892430'],
       ['15796', 'Tunde Adebayo', 'Male', ..., '25/07/2021 03:00 PM',
        'tunde.adebayo7287@mail.com', '09490231820']],
      shape=(121440, 13), dtype='<U32')

In [4]:
# All columns share a single fixed-width Unicode string dtype because the
# file was loaded with dtype=str.
data.dtype

dtype('<U32')

In [None]:
# Only summarise arrays with "..." once they hold more than 2000 elements,
# and allow up to 160 characters per printed line.
np.set_printoptions(linewidth=160, threshold=2000)

In [6]:
def to_float(col):
    """Parse text values into floats, using NaN for anything unparseable.

    Each value is converted to ``str`` and stripped of surrounding
    whitespace. Empty strings, any spelling of "nan" (case-insensitive) and
    strings that ``float()`` rejects all become ``np.nan``.

    Parameters
    ----------
    col : array-like
        Values to parse. May be a list or an array of one or more
        dimensions.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``col``.
    """
    # np.asarray lets callers pass lists as well as arrays; astype(str)
    # gives np.char.strip the string dtype it requires.
    text = np.asarray(np.char.strip(np.asarray(col).astype(str)))
    out = np.full(text.shape, np.nan, dtype=float)

    # `out` is freshly allocated, so reshape(-1) is a view: writing through
    # `flat` fills `out` in place whatever the input's dimensionality.
    flat = out.reshape(-1)
    for i, v in enumerate(text.reshape(-1)):
        if v == "" or v.lower() == "nan":
            continue
        try:
            flat[i] = float(v)
        except ValueError:
            # Non-numeric text is left as NaN on purpose.
            pass
    return out

# Parse the numeric columns by position:
# 3 = Age, 7 = MathScore, 8 = EnglishScore, 9 = AttendancePct.
age, math, eng, att = (to_float(data[:, column]) for column in (3, 7, 8, 9))



In [7]:
# Ages below 10 or above 20 are treated as missing.
bad_age = np.logical_or(age < 10, age > 20)
np.putmask(age, bad_age, np.nan)

# Every missing age is replaced by the median of the ages that remain.
age_med = np.nanmedian(age)
missing_age = np.isnan(age)
age = np.where(missing_age, age_med, age)


In [8]:
# Clean scores (clamp 0–100, fill missing with mean)
def clamp_fill(x, lo=0, hi=100):
    """Clamp values into ``[lo, hi]`` and replace NaNs with the mean.

    The mean is taken over the clamped, non-NaN values, so out-of-range
    entries contribute as ``lo`` or ``hi`` rather than their raw value.

    Parameters
    ----------
    x : array-like of float
        Values to clean; NaN marks a missing entry.
    lo, hi : number, optional
        Lower and upper bounds applied before averaging.

    Returns
    -------
    numpy.ndarray
        Clamped values with every NaN replaced by the mean.
    """
    bounded = np.clip(x, lo, hi)
    fill_value = np.nanmean(bounded)
    missing = np.isnan(bounded)
    return np.where(missing, fill_value, bounded)

# Both score columns get the same clamp-and-fill treatment.
math, eng = (clamp_fill(scores) for scores in (math, eng))


In [9]:
# Attendance gets the same rule as the score columns: clamp to 0-100, then
# fill missing values with the mean. Calling clamp_fill keeps that rule in
# one place instead of repeating it inline.
att = clamp_fill(att)


In [10]:
# Text answers that appear in the YearsCode column (index 6) and the number
# each one is replaced with before parsing.
YEARS_TEXT_TO_NUMBER = {
    "Less than 1 year": "0",
    "More than 50 years": "51",
    # A spelled-out number is a real answer; blanking it would discard the
    # value and replace it with the median.
    "five": "5",
}

years_raw = np.char.strip(data[:, 6])
for label, number in YEARS_TEXT_TO_NUMBER.items():
    years_raw = np.where(years_raw == label, number, years_raw)

# Whatever is still non-numeric becomes NaN in to_float and is then filled
# with the median of the parsed values.
years = to_float(years_raw)
years = np.where(np.isnan(years), np.nanmedian(years), years)


In [11]:
# Remove rows that repeat an earlier row in every column. np.unique reports
# the index of each distinct row's first occurrence; sorting those indices
# keeps the surviving rows in their original file order.
_, idx = np.unique(data, return_index=True, axis=0)
first_rows = np.sort(idx)
data_unique = np.take(data, first_rows, axis=0)
print("before:", data.shape, "after:", data_unique.shape)


before: (121440, 13) after: (120000, 13)


In [12]:
# Keep only the first row seen for each StudentID (column 0), in original
# file order. This reassigns data_unique and idx.
student_id = data[:, 0]
_, idx = np.unique(student_id, return_index=True)
data_unique = data.take(np.sort(idx), axis=0)


In [14]:
# Object dtype lets the cleaned values be stored as strings of any length
# (the raw array has a fixed string width). astype returns a new array, so
# `data` itself is left untouched.
clean = data.astype(object)

# Write the cleaned numeric columns back as text. Gender (2), ClassLevel (4)
# and Country (5) are kept exactly as loaded.
for column, values in ((3, age), (6, years), (7, math), (8, eng), (9, att)):
    clean[:, column] = values.astype(str)

# Drop exact duplicate rows from the output. The cleaned columns line up
# row-for-row with `data`, so the duplicates are located on the raw rows and
# the same row selection is applied to `clean`. Sorting the first-occurrence
# indices keeps the original file order.
first_rows = np.unique(data, axis=0, return_index=True)[1]
clean = clean[np.sort(first_rows)]



In [18]:
# Column names for the output file, one per column of the array, in order.
headers = [
    "StudentID",
    "Name",
    "Gender",
    "Age",
    "ClassLevel",
    "Country",
    "YearsCode",
    "MathScore",
    "EnglishScore",
    "AttendancePct",
    "Date",
    "Email",
    "Phone",
]


In [19]:
out = "students_cleaned_numpy.csv"

# fmt="%s" writes each cell's string form as-is, and comments="" stops
# savetxt from prefixing the header line with "# ".
# encoding="utf-8" matches the encoding the raw file was read with; NumPy
# documents latin1 as savetxt's default, which cannot represent every
# character a UTF-8 input may contain.
np.savetxt(
    out,
    clean,
    delimiter=",",
    fmt="%s",
    header=",".join(headers),
    comments="",
    encoding="utf-8",
)

print("saved:", out)


saved: students_cleaned_numpy.csv
