In [None]:
import re
import pandas as pd


# Source CSV, located one directory above the working directory. A forward
# slash is used because it resolves on Windows as well as POSIX systems,
# whereas a backslash is only a path separator on Windows.
FILE_PATH = "../kindle_reviews.csv"
# Free-text columns that transform_data() cleans and derives features from.
TEXT_COLS = ["reviewText", "summary"]
# Columns removed by transform_data().
COLS_TO_DROP = ["reviewerName", "reviewTime"]
# Load only the first 100,000 rows (an int, as read_csv documents for nrows);
# the first CSV column becomes the index.
df = pd.read_csv(FILE_PATH, nrows=100_000, index_col=0)


def clean_text(text):
    """Return a lowercased copy of *text* without punctuation or symbols.

    Only ASCII lowercase letters, digits and whitespace survive; every other
    character is removed (not replaced by a space). Any value that is not a
    ``str`` (for example ``None`` or ``NaN``) yields an empty string.
    """
    if isinstance(text, str):
        return re.sub(r"[^a-z0-9\s]", "", text.lower())
    return ""


def _camel_to_snake(name):
    """Convert a camelCase identifier to snake_case.

    A run of capitals is kept together as one word, so ``reviewerID``
    becomes ``reviewer_id`` instead of ``reviewer_i_d``.
    """
    # Two zero-width split points:
    #   * lowercase/digit followed by a capital:  reviewText -> review_Text
    #   * inside a capital run, before the capital that starts a new
    #     capitalized word:                       HTMLParser -> HTML_Parser
    return re.sub(
        r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", "_", name
    ).lower()


def transform_data(df):
    """Derive numeric/date/text features and normalize column names.

    The frame is modified IN PLACE and also returned; pass ``df.copy()`` to
    keep the original intact.

    Steps:
      * ``helpful``: strings of the form ``"[a, b]"`` are replaced by the
        ratio ``a / b``. Rows where the ratio is undefined (``0 / 0``) or
        where the value is missing or does not match the pattern become 0.
        (Presumably a = helpful votes, b = total votes; confirm against the
        dataset description.)
      * ``review_datetime``: ``unixReviewTime`` interpreted as epoch seconds.
      * ``days_since_review``: whole days between the current time and
        ``review_datetime`` (so the value depends on when this is run).
      * For every column in ``TEXT_COLS``: ``<col>_isna``, ``<col>_len`` and
        ``<col>_clean`` are added; missing text is treated as "".
      * Columns in ``COLS_TO_DROP`` and the raw ``TEXT_COLS`` are dropped.
      * All column names are converted to snake_case.

    Args:
        df: DataFrame containing ``helpful``, ``unixReviewTime`` and the
            columns named in ``TEXT_COLS`` and ``COLS_TO_DROP``.

    Returns:
        The same DataFrame object, transformed.
    """
    # Cast to float rather than int: str.extract yields NaN for rows that do
    # not match, and NaN cannot be cast to int. Those rows end up as 0 below.
    votes = df["helpful"].str.extract(r"\[(\d+), (\d+)\]").astype(float)
    df["helpful"] = (votes[0] / votes[1]).fillna(0)
    df["review_datetime"] = pd.to_datetime(df["unixReviewTime"], unit="s")
    df["days_since_review"] = (
        pd.to_datetime("now") - df["review_datetime"]
    ).dt.days

    for col in TEXT_COLS:
        # Record missingness before it is erased by fillna.
        df[col + "_isna"] = df[col].isna()
        df[col] = df[col].fillna("")
        # Length is measured on the raw text, before cleaning.
        df[col + "_len"] = df[col].str.len()
        df[col + "_clean"] = df[col].apply(clean_text)
    df.drop(columns=COLS_TO_DROP + TEXT_COLS, inplace=True)

    df.columns = [_camel_to_snake(name) for name in df.columns]

    return df


# Transform a copy so the raw `df` loaded above is left unmodified
# (transform_data mutates the frame it receives).
df_transformed = transform_data(df.copy())

# Show the first rows of the transformed frame.
# NOTE(review): `display` is not defined or imported in this file; presumably
# the IPython/Jupyter builtin, so this only runs inside a notebook — confirm.
print("\nРезультат после трансформации:")
display(df_transformed.head())

# Print column dtypes and non-null counts of the transformed frame.
print("\nТипы данных после трансформации:")
df_transformed.info()


Результат после трансформации:


Unnamed: 0,asin,helpful,overall,reviewer_i_d,unix_review_time,review_datetime,days_since_review,review_text_isna,review_text_len,review_text_clean,summary_isna,summary_len,summary_clean
0,B000F83SZQ,0.0,5,A1F6404F1VG29J,1399248000,2014-05-05,4108,False,294,i enjoy vintage books and movies so i enjoyed ...,False,18,nice vintage story
1,B000F83SZQ,1.0,4,AN0N05A9LIJEQ,1388966400,2014-01-06,4227,False,455,this book is a reissue of an old one the autho...,False,12,different
2,B000F83SZQ,1.0,4,A795DMNCJILA6,1396569600,2014-04-04,4139,False,375,this was a fairly interesting read it had old...,False,5,oldie
3,B000F83SZQ,1.0,5,A1FV0SX13TWVXQ,1392768000,2014-02-19,4183,False,101,id never read any of the amy brewster mysterie...,False,18,i really liked it
4,B000F83SZQ,0.0,4,A3SPTOKDG7WBLN,1395187200,2014-03-19,4155,False,130,if you like period pieces clothing lingo you ...,False,14,period mystery



Типы данных после трансформации:
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   asin               100000 non-null  object        
 1   helpful            100000 non-null  float64       
 2   overall            100000 non-null  int64         
 3   reviewer_i_d       100000 non-null  object        
 4   unix_review_time   100000 non-null  int64         
 5   review_datetime    100000 non-null  datetime64[ns]
 6   days_since_review  100000 non-null  int64         
 7   review_text_isna   100000 non-null  bool          
 8   review_text_len    100000 non-null  int64         
 9   review_text_clean  100000 non-null  object        
 10  summary_isna       100000 non-null  bool          
 11  summary_len        100000 non-null  int64         
 12  summary_clean      100000 non-null  object        
dtypes: bool(2), date