In [1]:
import os
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
from string import punctuation

## train_orig/test_orig

In [2]:
# Read the raw train / test question-pair tables and preview the training set.
train_orig, test_orig = (
    pd.read_csv("../input/train.csv"),
    pd.read_csv("../input/test.csv"),
)
train_orig.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Report the number of missing values per column, training set first.
for frame in (train_orig, test_orig):
    print(frame.isnull().sum())

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64
test_id      0
question1    2
question2    4
dtype: int64


### 空值填补

In [4]:
# Replace every missing value with a single-space placeholder string.
train_orig, test_orig = (frame.fillna(" ") for frame in (train_orig, test_orig))

### 小写转化/去除标点/缩写转化

In [5]:
# Ordered (pattern, replacement) rewrite rules applied to lower-cased text.
# Order matters: later rules rely on the spacing / characters left by earlier ones.
# Patterns are compiled once here instead of being looked up on every call.
_TEXT_REWRITE_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # --- contractions of "is" that must not be read as possessives ---
        (r"what's", "what is"),
        (r"who's", "who is"),
        (r"where's", "where is"),
        (r"when's", "when is"),
        (r"how's", "how is"),
        (r"it's", "it is"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"there's", "there is"),
        # --- drop every character that no later rule needs ---
        # The hyphen is escaped on purpose: an unescaped "+-=" is a *range*
        # that also lets ":", ";" and "<" through.
        (r"[^A-Za-z0-9^,!./'+\-=]", " "),
        # Apart from the contractions handled above, 's can only be a
        # possessive, so it is replaced by a space.
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am"),
        (r" m ", " am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        # "1990s" -> "1990".  (A "\0s" pattern would look for a NUL byte.)
        (r"0s\b", "0"),
        # Keep the surrounding spaces so neighbouring words are not glued on.
        (r" 9 11 ", " 911 "),
        (r"e-mail", "email"),
        (r"\s{2,}", " "),
        # --- spelling fixes and abbreviation expansion ---
        (r"quikly", "quickly"),
        (r" usa ", " america "),
        (r" u s ", " america "),
        (r" uk ", " england "),
        (r"imrovement", "improvement"),
        (r"intially", "initially"),
        (r" dms ", " direct messages "),
        (r"demonitization", "demonetization"),
        (r"actived", "active"),
        # Only a stand-alone unit ("10kms", "10 kms"), never part of a word.
        (r"(?<![a-z])kms(?![a-z])", " kilometers "),
        (r" cs ", " computer science "),
        (r" ds ", " data science "),
        (r" ee ", " electronic engineering "),
        (r" upvotes ", " up votes "),
        (r" iphone ", " phone "),
        # "500rs " -> "500 rs ": split the currency suffix from the amount.
        (r"(?<=\d)rs ", " rs "),
        (r"calender", "calendar"),
        # Whole word only, so "curiosity" / "scenarios" are left alone.
        (r"\bios\b", "operating system"),
        (r"programing", "programming"),
        (r"bestfriend", "best friend"),
        # Text is already lower-cased, so the roman numeral is "iii".
        (r"\biii\b", "3"),
        # Whole words only, so "the use" / "the user" are left alone.
        (r"\bthe us\b", "america"),
        # --- remaining punctuation becomes a space ---
        (r"[,.!/^+\-=']", " "),
        # "160k" -> "160000"; the boundary keeps "5kg" / "5km" intact.
        (r"(\d+)k\b", r"\g<1>000"),
    )
)


def common_words_transformation_remove_punctuation(text):
    """Lower-case ``text``, expand contractions/abbreviations and drop punctuation.

    The rules in ``_TEXT_REWRITE_RULES`` are applied in order: common
    contractions are expanded, unsupported characters and punctuation are
    replaced by spaces, a handful of misspellings are corrected and some
    abbreviations are spelled out.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The cleaned text, containing only ``a-z``, ``0-9`` and spaces.
        Whitespace is collapsed once midway through the rules but not at the
        end, so runs of spaces (and leading/trailing spaces) may remain; a
        whitespace-only input stays whitespace-only.
    """
    text = text.lower()
    for pattern, replacement in _TEXT_REWRITE_RULES:
        text = pattern.sub(replacement, text)
    return text

# Clean both question columns of both splits in place, then persist the
# cleaned tables and preview the training set.
for frame in (train_orig, test_orig):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(common_words_transformation_remove_punctuation)

train_orig.to_csv("train_orig.csv", index=False)
test_orig.to_csv("test_orig.csv", index=False)
train_orig.head()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is d...,0
4,4,9,10,which one dissolve in water quickly sugar sal...,which fish would survive in salt water,0


## train_stop/test_stop

In [6]:
# Display NLTK's English stop-word list (the words removed in the next step).
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### 清理停用词/令牌化

In [7]:
# Cache for the stop-word set so the corpus is loaded once, not once per row.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``word_tokenize``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        every token is a stop word or there are no tokens).
    """
    # The stop-word set is loop-invariant: fetch the cached copy instead of
    # rebuilding it for every row passed through ``Series.apply``.
    stops = _english_stopwords()
    return " ".join(token for token in word_tokenize(text) if token not in stops)

# Work on deep copies so the cleaned originals stay untouched, strip stop
# words from both question columns, then save and preview.
train_stop = train_orig.copy(deep=True)
test_stop = test_orig.copy(deep=True)
for frame in (train_stop, test_stop):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(remove_stopwords)

train_stop.to_csv("train_stop.csv", index=False)
test_stop.to_csv("test_stop.csv", index=False)
train_stop.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,step step guide invest share market india,step step guide invest share market,0
1,1,3,4,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,0
2,2,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,3,7,8,mentally lonely solve,find remainder math 23 24 math divided 24 23,0
4,4,9,10,one dissolve water quickly sugar salt methane ...,fish would survive salt water,0


## train_stem/test_stem

### 词根提取

In [8]:
def stem_words(text):
    """Stem every token of ``text`` with the English Snowball stemmer.

    The text is split with ``word_tokenize``; each token is stemmed and the
    stems are joined back together with single spaces.
    """
    snowball = SnowballStemmer("english")
    tokens = word_tokenize(text)
    return " ".join(snowball.stem(token) for token in tokens)

# Start from deep copies of the stop-word-filtered tables, stem both question
# columns, then save and preview.
train_stem = train_stop.copy(deep=True)
test_stem = test_stop.copy(deep=True)
for frame in (train_stem, test_stem):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(stem_words)

train_stem.to_csv("train_stem.csv", index=False)
test_stem.to_csv("test_stem.csv", index=False)
train_stem.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,step step guid invest share market india,step step guid invest share market,0
1,1,3,4,stori kohinoor koh noor diamond,would happen indian govern stole kohinoor koh ...,0
2,2,5,6,increas speed internet connect use vpn,internet speed increas hack dns,0
3,3,7,8,mental lone solv,find remaind math 23 24 math divid 24 23,0
4,4,9,10,one dissolv water quick sugar salt methan carb...,fish would surviv salt water,0


## train_lem/test_lem

### 词形还原

In [9]:
def lemmatize_words(text):
    """Lemmatize every token of ``text`` with NLTK's WordNet lemmatizer.

    The text is split with ``word_tokenize``; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments and the
    results are joined back together with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

# Start from deep copies of the stop-word-filtered tables, lemmatize both
# question columns, then save and preview.
train_lem = train_stop.copy(deep=True)
test_lem = test_stop.copy(deep=True)
for frame in (train_lem, test_lem):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(lemmatize_words)

train_lem.to_csv("train_lem.csv", index=False)
test_lem.to_csv("test_lem.csv", index=False)
train_lem.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,step step guide invest share market india,step step guide invest share market,0
1,1,3,4,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,0
2,2,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,3,7,8,mentally lonely solve,find remainder math 23 24 math divided 24 23,0
4,4,9,10,one dissolve water quickly sugar salt methane ...,fish would survive salt water,0
