In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def process_numeric(df):
  ''' Receives a pandas DataFrame's numerical columns and returns a DataFrame preprocessed'''
  # dealing with NaN values
  knn = KNNImputer()
  df = pd.DataFrame(knn.fit_transform(df), columns=df.columns)

  return df

In [3]:
def categoricDefiner(df):
  '''Returns the DataFrame only with its object categorical columns'''
  cat = [col for col in df.columns if (df[col].nunique()/len(df))*100 < 80] 
  
  return df[cat] 

In [4]:
def process_categorical(df):
  ''' Receives a pandas DataFrame's categorical columns and returns a DataFrame preprocessed'''
  df = df.fillna("None")
  ohe = OneHotEncoder(handle_unknown='ignore')
  df = pd.DataFrame(ohe.fit_transform(df).todense())

  return df

In [5]:
def semanticalDefiner(df):
  '''Returns the DataFrame only with its object semantical columns'''
  sem = [col for col in df.columns if (df[col].nunique()/len(df))*100 >= 80] 
  
  return df[sem]

In [6]:

def process_semantical(df):
  ''' Receives a pandas DataFrame's semantical columns and returns a DataFrame preprocessed'''
  df = df.fillna("None")
  vec = TfidfVectorizer(max_features=2000)

  df2 = pd.DataFrame()
  for col in df.columns:
    df2 = pd.concat([df2,pd.DataFrame(vec.fit_transform(df[col]).todense())], axis=1)

  return df2

In [7]:
def process_df(df):
  ''' Receives a pandas DataFrame and returns a DataFrame preprocessed'''
  df2 = df.copy()
  df_num = pd.DataFrame()
  df_sem = pd.DataFrame()
  df_cat = pd.DataFrame()
  obj_cols = [col for col in df2.columns if df2[col].dtype == "object"]

  if len(df2[obj_cols]) < df.shape[1]:
    df_num = process_numeric(df2.select_dtypes(exclude="object"))

  if len(df2[obj_cols]) > 0:
    df_cat = process_categorical(categoricDefiner(df2.select_dtypes(include="object")))
    df_sem = process_semantical(semanticalDefiner(df2.select_dtypes(include="object")))

  df2 = pd.concat([df_num, df_cat, df_sem], axis=1)

  # standardizing features
  sca = StandardScaler()
  df2 = pd.DataFrame(sca.fit_transform(df2))

  # reducing the number of columns
  if df2.shape[1] > 100:
    pca = PCA(n_components=100)
    df2 = pd.DataFrame(pca.fit_transform(df2))

  return df2

In [9]:
df_houses = pd.read_csv("Housing Prices.csv")

process_df(df_houses)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-3.989720,1.122519,-0.273248,-1.211710,-1.994189,-2.347313,-0.408662,-0.530326,0.558897,0.458753,...,0.043492,0.167116,-0.576055,0.123806,-0.332996,0.260492,-0.590353,0.720137,0.220130,-0.353097
1,-0.370673,-2.325052,0.430564,0.546835,-0.668568,-0.596953,0.440734,-0.238219,-1.010618,-0.418669,...,-0.468960,-1.642825,1.025474,-0.740878,-1.786680,1.098933,1.482250,-1.617515,0.762533,0.692451
2,-4.768682,0.707758,0.250171,-0.346129,-1.321461,-3.192154,-0.970792,-0.336490,0.299929,-0.522899,...,0.037999,0.283457,-0.562361,0.305329,-0.342507,-0.571765,-0.161194,0.703993,-0.446678,-0.323414
3,1.284514,-0.294505,-1.016508,-0.681369,2.192859,-0.711955,-1.270244,0.740498,-1.202107,0.968803,...,-0.079141,-0.167538,-0.091889,-0.520223,-0.073809,-1.507272,0.123996,0.223793,-0.180411,-0.084654
4,-5.115230,0.566319,0.504898,0.012313,-1.492141,-3.463636,-1.349564,0.257236,0.492375,-1.525915,...,0.083616,1.651802,0.052171,-1.546915,0.440677,-0.396563,-0.263524,-0.862625,-0.343876,0.731118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-2.140232,0.298699,-0.603619,-1.108731,-1.451711,-2.334684,-0.922788,-0.049431,-0.552041,0.205880,...,0.161887,-0.920320,-0.396658,-0.358556,-0.305794,0.384822,-0.369764,-0.554540,0.075176,0.351347
1456,1.072218,-5.014554,1.507630,0.435578,-0.599052,0.599969,0.270633,1.883751,1.047359,0.717981,...,-0.364131,0.020150,-1.228222,-0.533484,-2.105596,-0.692138,0.311853,-0.071691,0.076096,0.726948
1457,-1.032648,0.306930,-0.620701,0.093488,1.580110,1.372433,0.455122,1.722351,-0.760363,-3.239886,...,-0.454943,-0.696874,-1.337652,0.342394,-0.118884,1.766350,-2.050648,0.044527,1.201651,0.225559
1458,1.601145,-2.207722,0.268916,-0.859572,0.514626,1.747471,-0.140997,-1.946319,0.830589,1.359584,...,0.294754,-0.442979,-1.016255,1.027730,1.087021,0.619184,1.538780,1.402519,0.072968,-0.244785


In [10]:
df_twitter = pd.read_csv("Twitter Sentiment.csv")

process_df(df_twitter)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-2.663352,-0.680896,0.987161,3.485804,-0.575981,0.784885,0.466299,-1.198978,0.095473,-0.419408,...,0.068167,0.635533,0.448895,0.541371,-0.932537,-0.233232,-0.763891,-0.252193,0.122439,1.092357
1,2.347847,-1.661458,-2.068812,-0.489069,1.087452,1.275347,0.968342,-0.242423,-0.283257,0.223235,...,0.485115,-1.340853,-0.731443,-0.437903,-0.023061,0.901500,-0.848929,1.054238,-0.532150,-1.234817
2,0.412756,-1.019367,-1.634687,-0.081979,0.582534,1.256364,-1.480756,2.367443,0.597713,1.627198,...,1.786428,0.473301,-0.976973,0.752840,0.593829,-0.434911,-0.742178,-0.792945,-1.945885,-0.453115
3,0.469565,-1.319011,-0.886179,-0.117864,0.116672,2.689480,-1.501184,-1.339512,1.490536,0.915961,...,0.333316,-3.829032,-1.128287,-0.781299,-0.482818,2.541401,-1.850318,0.269292,-0.699194,-1.175115
4,-1.067725,-0.264568,-0.642533,-0.046703,0.019883,-2.812942,-0.012106,3.116322,-0.786539,-0.595262,...,0.454533,0.704802,-0.671450,0.341003,0.461127,0.322911,-1.152263,1.213589,-0.286958,-0.044986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27476,0.999517,-1.582725,-1.267486,-0.145157,0.943309,1.086595,-0.578054,-0.250954,-0.365153,1.198247,...,1.412616,0.142844,0.928298,1.980087,-0.739776,3.048769,-0.202576,2.842376,0.118195,-0.693603
27477,-0.492780,-0.944961,-1.036108,1.493443,0.174397,1.259970,-0.997943,1.055105,0.163434,-0.676298,...,0.344973,0.401546,1.128535,1.555291,0.947339,-0.463716,-1.194140,0.658747,-0.359127,-0.027000
27478,1.014375,0.007522,0.545884,1.721099,-1.557681,-2.256605,1.382881,-1.137062,-1.347626,-1.654256,...,-2.050109,0.601287,0.721569,-0.357025,-1.062926,-0.971440,0.279110,0.080844,-1.389230,0.339637
27479,0.467790,-1.279133,-0.441193,2.057864,1.032928,-1.679515,0.770206,-0.399878,-0.632842,-0.368357,...,-0.859881,0.281770,-0.065213,-0.022383,-1.223495,-0.218951,-2.779308,-0.391721,-1.651744,0.224260
