In [1]:
import pandas as pd
import os
import numpy as np
import numpy as np
import string
import nltk

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcthanvancon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcthanvancon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using TensorFlow backend.


In [2]:
data_dir = '../data'
df = pd.read_csv(os.path.join(data_dir,"data_with_gender_rec.csv"))
df = df[['MATRICULEINT', 'TXT', 'GENRE']]
df.loc[:,'GENRE'] = pd.to_numeric(df.loc[:,'GENRE'], errors = 'coerce', downcast = 'integer')
df.dropna(inplace = True)
df = df[df['GENRE'].isin([1, 2])]

In [3]:
"""##Stop words removal"""
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

#Import stop words
language = 'french'
stop_words = set(stopwords.words(language)) 
punctuation = string.punctuation + filters + '\n' #remember to remove utf words

#Row by row tokenization
def tokenization_and_stop_words_out(text):
  x = word_tokenize(text)
  y = [w for w in x if not w in stop_words and not w in punctuation]
  return y

df.loc[:,'TXT'] = df['TXT'].apply(tokenization_and_stop_words_out)

"""## Encoding labels

Here we use to one hot encoding for encoding genders
"""
df = pd.get_dummies(df, columns = ['GENRE'])

In [4]:
"""## Splitting into train and test df"""

#Initial split into train and test dataframes
df_train_init, df_test = train_test_split(df, test_size = 0.25)

#Second split of train dataframe into train and val dataframes
df_train, df_val = train_test_split(df_train_init, test_size = 0.25)

"""## Tokenization"""
#Tokenizer training 
num_words = 10000
len_max_seq = 840

train_values = df_train.loc[:,'TXT'].tolist()

tokenizer = Tokenizer(num_words = num_words, filters= filters,lower =True)
tokenizer.fit_on_texts(df_train['TXT'].tolist())

In [5]:
#Text to sequences
df_train.loc[:,'TXT'] = tokenizer.texts_to_sequences(df_train.loc[:,'TXT'])
df_val.loc[:,'TXT'] = tokenizer.texts_to_sequences(df_val.loc[:,'TXT'])
df_test.loc[:,'TXT'] = tokenizer.texts_to_sequences(df_test.loc[:,'TXT'])
df_val.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,MATRICULEINT,TXT,GENRE_1.0,GENRE_2.0
2697,001-KT-1855708,"[351, 221, 74, 5946, 1, 1809, 73, 1573, 7891, ...",0,1
1442,001-RD-1829703,"[217, 1, 1, 1, 1, 5053, 1, 18, 25, 1, 1, 1, 17...",1,0
5714,001-QO-1826762,"[1, 4151, 1, 328, 26, 1, 394, 16, 5210, 1, 544...",0,1
9197,001-VAL-1814546,"[1170, 1, 1, 633, 25, 1, 60, 1, 60, 1, 1270, 1...",1,0
5347,001-SNS-0006519,"[7010, 1, 918, 26, 1, 339, 33, 7915, 4119, 1, ...",1,0


In [7]:
print(df_val.shape)
print(df_train.shape)
print(df_test.shape)

(2117, 4)
(6349, 4)
(2823, 4)


In [None]:
test = df_train.copy(deep = True)

In [None]:
df.apply(lambda x: len(x['TXT']), axis = 1).max()

In [None]:
#pad_sequences(df_train.loc[:,'TXT'].tolist(), len_max_seq)
test['TXT'] = pad_sequences(df_train.loc[:,'TXT'], len_max_seq).tolist()
filter = test['length'] == test['length'].max()
test[filter]

In [8]:
#Padding sequences
#df_train['TXT'] = 
df_train['TXT'] = pad_sequences(df_train.loc[:,'TXT'], len_max_seq).tolist()
df_val['TXT'] = pad_sequences(df_val.loc[:,'TXT'], len_max_seq).tolist()
df_test['TXT'] = pad_sequences(df_test.loc[:,'TXT'], len_max_seq).tolist()


"""## Export"""
#df_train.to_csv(os.path.join(data_dir, "train.csv"),index = False)
#df_val.to_csv(os.path.join(data_dir, "val.csv"),index = False)
#df_test.to_csv(os.path.join(data_dir, "test.csv"),index = False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


'## Export'

In [9]:
df_val.head()

Unnamed: 0,MATRICULEINT,TXT,GENRE_1.0,GENRE_2.0
2697,001-KT-1855708,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1
1442,001-RD-1829703,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0
5714,001-QO-1826762,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1
9197,001-VAL-1814546,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0
5347,001-SNS-0006519,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0
