In [1]:
import pandas as pd
import numpy as np

### 1. Load data

In [2]:
raw_df = pd.read_csv('data/raw/query_train.csv', skiprows=0, sep='|')
test_df = pd.read_csv('data/raw/query_test.csv', skiprows=0, sep='|')

In [3]:
raw_df.columns = ['query_id', 'query', 'label']
test_df.columns = ['query_id', 'query']

In [4]:
raw_df.head()

Unnamed: 0,query_id,query,label
0,1,fertilizer use of paddy crops ....?,1
1,2,Information regarding weather in FATEHABAD?,8
2,3,REGARDING ANIMAL HUSBANDRY?,8
3,4,TELL ME SUBSIDY ON SUGARCANE THRASH CUTTER.,3
4,5,TELL M E ABOUT WHITEGROUB CONTROIL IN PENUT ?,6


In [76]:
test_df.head()

Unnamed: 0,query_id,query
0,1,ASKED ABOUT ATTACK APHIDS ON TUR?
1,2,ASKED ABOUT ATTACK BLIGHT ON SOYBEAN?
2,3,ASKED ABOUT ATTACK FUNGAL ATTACK ON CHILLI?
3,4,ASKED ABOUT ATTACK OF SUCKING PEST ON SOYABEAN ?
4,5,ASKED ABOUT ATTACK RED MITES ON COTTON?


In [29]:
raw_df.count()

query_id    100622
query       100618
label       100622
dtype: int64

As can be observed, there are 4 NaN values in the `query` column.

In [30]:
test_df.count()

query_id         16032
query            16032
cleaned_query    16032
dtype: int64

### 2. Split raw data

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
raw_train_df, raw_valid_df = train_test_split(raw_df, stratify=raw_df['label'], test_size=0.2)

In [7]:
np.corrcoef(raw_train_df.label.value_counts(), raw_valid_df.label.value_counts())

array([[ 1.        ,  0.99999999],
       [ 0.99999999,  1.        ]])

In [23]:
raw_train_df.to_csv('data/processed/raw_train.csv', sep=',', index=False)
raw_valid_df.to_csv('data/processed/raw_valid.csv', sep=',', index=False)

### 3. Add feature columns

In [9]:
df_1 = raw_df

In [10]:
df_1 = df_1[df_1['query'].notnull()]

In [11]:
temp_df = test_df[test_df['query'].notnull()]

In [12]:
assert len(temp_df) == len(test_df)

#### a. Get query length

In [24]:
df_1['len_query'] = df_1.apply(lambda row: len(row.query.split(' ')), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
df_1.head()

Unnamed: 0,query_id,query,label,len_query
0,2,Information regarding weather in FATEHABAD?,8,5
1,3,REGARDING ANIMAL HUSBANDRY?,8,3
2,4,TELL ME SUBSIDY ON SUGARCANE THRASH CUTTER.,3,7
3,5,TELL M E ABOUT WHITEGROUB CONTROIL IN PENUT ?,6,9
4,6,Information regarding how to control becterial...,6,13


#### b. Clean text

In [13]:
def clean_text(query):
    #filter out punctuation
    import re
    query = re.sub('[^\w\s]','',query)
    
    #split into words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(query)
    
#     #single label for all non-alphabets
#     num_token = '<num>'
#     words = [num_token if word.isnumeric() else word for word in words]
    
    #convert to lower case
    words = [word.lower() for word in words]
    
    return ' '.join(words)

In [14]:
out = clean_text('TELL M E ABOUT WHITEGROUB 12 CONTROIL IN PENUT ?')
print(out)

tell m e about whitegroub 12 controil in penut


In [15]:
df_1['cleaned_query'] = df_1.apply(lambda row: clean_text(row.query), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
test_df['cleaned_query'] = test_df.apply(lambda row: clean_text(row.query), axis=1)

In [17]:
df_1.head()

Unnamed: 0,query_id,query,label,cleaned_query
0,1,fertilizer use of paddy crops ....?,1,fertilizer use of paddy crops
1,2,Information regarding weather in FATEHABAD?,8,information regarding weather in fatehabad
2,3,REGARDING ANIMAL HUSBANDRY?,8,regarding animal husbandry
3,4,TELL ME SUBSIDY ON SUGARCANE THRASH CUTTER.,3,tell me subsidy on sugarcane thrash cutter
4,5,TELL M E ABOUT WHITEGROUB CONTROIL IN PENUT ?,6,tell m e about whitegroub controil in penut


In [18]:
test_df.head()

Unnamed: 0,query_id,query,cleaned_query
0,1,ASKED ABOUT ATTACK APHIDS ON TUR?,asked about attack aphids on tur
1,2,ASKED ABOUT ATTACK BLIGHT ON SOYBEAN?,asked about attack blight on soybean
2,3,ASKED ABOUT ATTACK FUNGAL ATTACK ON CHILLI?,asked about attack fungal attack on chilli
3,4,ASKED ABOUT ATTACK OF SUCKING PEST ON SOYABEAN ?,asked about attack of sucking pest on soyabean
4,5,ASKED ABOUT ATTACK RED MITES ON COTTON?,asked about attack red mites on cotton


#### Save files

In [19]:
processed_train_df, processed_valid_df = train_test_split(df_1[['query_id', 'cleaned_query', 'label']], stratify=df_1['label'], test_size=0.2)

In [20]:
processed_test_df = test_df[['query_id', 'cleaned_query']]

In [43]:
processed_train_df.to_csv('data/processed/processed_train.csv', sep=',', index=False)
processed_valid_df.to_csv('data/processed/processed_valid.csv', sep=',', index=False)
processed_test_df.to_csv('data/processed/processed_test.csv', sep=',', index=False)

In [36]:
df_1.len_query.describe()

count    100617.000000
mean          6.987447
std          11.086267
min           1.000000
25%           5.000000
50%           6.000000
75%           8.000000
max        3118.000000
Name: len_query, dtype: float64

About 75% of queries have more than 5 words.

#### c. Add language labels

In [29]:
from nltk.corpus import words

In [30]:
en_word_set = set(words.words())

In [48]:
def detect_lang(query):
    tokens = query.split(' ')
    
    en_count = 0
    len_query = len(tokens)
    for t in tokens:
        if t in en_word_set:
            en_count += 1
    if float(en_count/len_query) >= 0.5:
        return 'en'
    else:
        return 'not_en'

In [49]:
df_1['lang'] = df_1.apply(lambda row: detect_lang(row.cleaned_query), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
df_1.head()

Unnamed: 0,query_id,query,label,len_query,cleaned_query,lang
0,2,Information regarding weather in FATEHABAD?,8,5,information regarding weather in fatehabad,en
1,3,REGARDING ANIMAL HUSBANDRY?,8,3,regarding animal husbandry,en
2,4,TELL ME SUBSIDY ON SUGARCANE THRASH CUTTER.,3,7,tell me subsidy on sugarcane thrash cutter,en
3,5,TELL M E ABOUT WHITEGROUB CONTROIL IN PENUT ?,6,9,tell m e about whitegroub controil in penut,en
4,6,Information regarding how to control becterial...,6,13,information regarding how to control becterial...,en


In [51]:
df_1[df_1.lang=='not_en'].head()

Unnamed: 0,query_id,query,label,len_query,cleaned_query,lang
14,16,kya ganna bone ke bad pani lagaa sate hai,0,9,kya ganna bone ke bad pani lagaa sate hai,not_en
23,25,mausam ki jankari ?,0,5,mausam ki jankari,not_en
31,33,DHAN ME DEEMAK LAGA HAI ?,2,6,dhan me deemak laga hai,not_en
32,34,alovera market infarmetion ?,6,4,alovera market infarmetion,not_en
37,39,80 Din ke makka me kiya khad dai,6,8,80 din ke makka me kiya khad dai,not_en


### 4. Word cloud

In [62]:
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
import pandas as pd 
  
# Reads 'Youtube04-Eminem.csv' file  
df = pd.read_csv('data/raw/query_train.csv', sep='|')

comment_words = ' '
stopwords = set(STOPWORDS) 
  
# iterate through the csv file 
for val in list(df.iloc[:,1]): 
      
    # typecaste each val to string 
    val = str(val) 
  
    # split the value 
    tokens = val.split(' ') 
      
    # Converts each token into lowercase 
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].lower() 
          
    for words in tokens: 
        comment_words = comment_words + words + ' '
  
  
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(comment_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

KeyboardInterrupt: 

In [50]:
df = pd.read_csv('data/raw/query_train.csv', sep='|')

In [66]:
# ! pip install mechanicalsoup
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://translate.google.com/#auto/en/bindi khana achha hota hai")
print(browser.get_current_page())
browser.launch_browser()

<!DOCTYPE html>
<html><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="translate, translations, translation, translator, machine translation, online translation" name="keywords"/><meta content="Google's free service instantly translates words, phrases, and web pages between English and over 100 other languages." name="description"/><meta content="noodp" name="robots"/><meta content="notranslate" name="google"/><link href="https://translate.google.com/" rel="canonical"/><title>Google Translate</title><link href="/opensearch.xml?hl=en" rel="search" title="Google Translate" type="application/opensearchdescription+xml"/><script>JS_ERR_COUNT = 0;JS_ERR_ARR = [];JS_LOADED = false;function _gtErr(e,url,line){if (++JS_ERR_COUNT > 10) {return;}var i=new Image();var err='e='+e.substr(0,1500)+',url='+url.substr(0,400)+',line='+line+',count='+JS_ERR_COUNT;JS_ERR_ARR.push(err);i.src='/gen204?jserr='+encodeURIComponent(err);i.onload=function(){i.onload=null;};