#### Load the TOEFL dataset, clean it, and convert categorical columns to numeric

In [64]:
import numpy as np
import pandas as pd

#### Read data

In [65]:
# Read the TOEFL-11 index CSV into the working DataFrame.
index_csv_path = '../TOEFL_11/index.csv'
df = pd.read_csv(index_csv_path)

# Preview the first five rows to check that the file parsed as expected.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Filename,Prompt,Language,Score Level,total word,distance,Hpoint,xmin,alpha,...,sigma,logtrue,file,pl_s,pl_C,pl_R2,log_d,log_m,log_k,log_R2
0,0,88.txt,P6,KOR,high,385,6.0,8,2.0,2.273182,...,3.02549,0,88.txt,0.626725,27.346485,0.8833,3.478245,0.639538,1.090635,0.869275
1,1,278.txt,P6,DEU,medium,321,11.0,7,3.0,2.421178,...,1.817156,0,278.txt,0.720874,31.641664,0.962596,3.573915,0.757765,1.032666,0.951214
2,2,348.txt,P1,TUR,high,363,8.306624,6,2.0,2.398038,...,1.739209,0,348.txt,0.62147,21.737883,0.965709,3.352892,0.874682,0.849857,0.889519
3,3,666.txt,P2,ZHO,medium,362,4.690416,6,5.0,3.410995,...,0.746383,0,666.txt,0.538645,18.320499,0.924139,2.824981,0.296708,1.441791,0.969115
4,4,733.txt,P6,TEL,medium,344,10.630146,7,6.0,3.124099,...,1.163868,0,733.txt,0.646729,25.998149,0.938335,3.393319,0.638436,1.093227,0.926643


#### Clean data

In [66]:
# Drop 'Unnamed: 0' because it is unnecessary, and 'Filename' because it duplicates 'file'.
# Reassigning the result (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError for columns that
# were already removed.
df = df.drop(columns=['Unnamed: 0', 'Filename'], errors='ignore')

# Summarise the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12098 entries, 0 to 12097
Data columns (total 19 columns):
Prompt         12098 non-null object
Language       12098 non-null object
Score Level    12098 non-null object
total word     12098 non-null int64
distance       12098 non-null float64
Hpoint         12098 non-null int64
xmin           12093 non-null float64
alpha          12093 non-null float64
mu             12093 non-null float64
sigma          12093 non-null float64
logtrue        12098 non-null int64
file           12098 non-null object
pl_s           12098 non-null float64
pl_C           12098 non-null float64
pl_R2          12098 non-null float64
log_d          12098 non-null float64
log_m          12098 non-null float64
log_k          12098 non-null float64
log_R2         12098 non-null float64
dtypes: float64(12), int64(3), object(4)
memory usage: 1.8+ MB


#### Handle missing value

In [67]:
# Count the missing values in each column (isnull is an alias of isna).
df.isnull().sum(axis=0)

Prompt         0
Language       0
Score Level    0
total word     0
distance       0
Hpoint         0
xmin           5
alpha          5
mu             5
sigma          5
logtrue        0
file           0
pl_s           0
pl_C           0
pl_R2          0
log_d          0
log_m          0
log_k          0
log_R2         0
dtype: int64

So xmin, alpha, mu and sigma each have missing values. Let's fill them with the column mean.

In [68]:
# Impute missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns: the frame still holds
# string columns at this point (Prompt, Language, Score Level, file), and without
# the flag pandas >= 2.0 raises a TypeError when it tries to average them.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Confirm that no missing values remain.
df.isna().sum()

Prompt         0
Language       0
Score Level    0
total word     0
distance       0
Hpoint         0
xmin           0
alpha          0
mu             0
sigma          0
logtrue        0
file           0
pl_s           0
pl_C           0
pl_R2          0
log_d          0
log_m          0
log_k          0
log_R2         0
dtype: int64

#### Handle category columns

There are three categorical columns: Prompt, Language and Score Level. Let's convert them to numeric: Prompt and Language with LabelEncoder, and Score Level with an explicit ordinal mapping (low = 1, medium = 2, high = 3).

In [69]:
from sklearn.preprocessing import LabelEncoder

# Prompt and Language are encoded with one LabelEncoder each. The fitted encoders
# stay available as prompt_le / language_le so the integer codes can be mapped
# back to the original labels with inverse_transform.
prompt_le = LabelEncoder()
df['Prompt'] = prompt_le.fit_transform(df['Prompt'].values)

language_le = LabelEncoder()
df['Language'] = language_le.fit_transform(df['Language'].values)

# Score Level is encoded with an explicit mapping so that low < medium < high
# is preserved in the numeric codes.
score_mapping = {
    'low': 1,
    'medium': 2,
    'high': 3,
}

# Series.map turns every value that is not a key of the mapping into NaN (this
# also happens when the cell is re-run on already-encoded values). Validate
# before overwriting the column so bad labels cannot silently reach the saved file.
encoded_score = df['Score Level'].map(score_mapping)
unmapped_mask = encoded_score.isna()
if unmapped_mask.any():
    unmapped_values = df.loc[unmapped_mask, 'Score Level'].unique().tolist()
    raise ValueError(
        "Unexpected 'Score Level' values not covered by score_mapping: {}".format(unmapped_values)
    )
df['Score Level'] = encoded_score

df.head()

Unnamed: 0,Prompt,Language,Score Level,total word,distance,Hpoint,xmin,alpha,mu,sigma,logtrue,file,pl_s,pl_C,pl_R2,log_d,log_m,log_k,log_R2
0,5,6,3,385,6.0,8,2.0,2.273182,-10.388887,3.02549,0,88.txt,0.626725,27.346485,0.8833,3.478245,0.639538,1.090635,0.869275
1,5,1,2,321,11.0,7,3.0,2.421178,-2.618379,1.817156,0,278.txt,0.720874,31.641664,0.962596,3.573915,0.757765,1.032666,0.951214
2,0,9,3,363,8.306624,6,2.0,2.398038,-2.825649,1.739209,0,348.txt,0.62147,21.737883,0.965709,3.352892,0.874682,0.849857,0.889519
3,1,10,2,362,4.690416,6,5.0,3.410995,0.835672,0.746383,0,666.txt,0.538645,18.320499,0.924139,2.824981,0.296708,1.441791,0.969115
4,5,8,2,344,10.630146,7,6.0,3.124099,-0.359383,1.163868,0,733.txt,0.646729,25.998149,0.938335,3.393319,0.638436,1.093227,0.926643


Now the data is clean and, once the `file` column is converted to an integer id in the next cell, all the columns are numeric. We will save it as processed_toefl.csv
and later use this file to feed the deep learning model.

In [71]:
# Convert file names such as '88.txt' into their integer id (88).
# The extension is stripped with a regex instead of slicing off the last four
# characters, and astype(str) lets the cell be re-run after the column has
# already been converted (the original raised TypeError on a second run).
df['file'] = (
    df['file']
    .astype(str)
    .str.replace(r'\.[^.]*$', '', regex=True)
    .astype('int64')
)

In [72]:
# Write the processed frame to disk; index=False leaves the row index out of the CSV.
processed_csv_path = '../TOEFL_11/processed_toefl.csv'
df.to_csv(processed_csv_path, index=False)