In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

In [2]:
#!pip install hdbscan-0.8.28-cp39-cp39-win_amd64.whl
#!pip install top2vec
#!pip install spacy

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Adding Features

## invistage on existing features

## add

### basics

In [20]:
full_data=[train,test]

for df in full_data:
    df['text_length']=df['text'].apply(len)
    df['sentence_no']=df.text.str.split('.').apply(len)
    df['words_no']=df.text.str.split().apply(len)
    l=df.text.str.lower()
    df['unique_words']=l.str.split().apply(set).apply(len)
    
    # # @ digits /W
    hashtag=df.text.str.findall(r'#\w+')
    df['#']=hashtag
    df['#_no']=hashtag.apply(len)
    for n,token in enumerate(hashtag):
        df.loc[n,'#_len']=np.array([len(i) for i in token]).mean()
    #@
    attag=df.text.str.findall(r'@\w+')
    df['@']=attag
    df['@_no']=attag.apply(len)
    for n,token in enumerate(hashtag):
        df.loc[n,'@_len']=np.array([len(i) for i in token]).mean()
    #digits
    digits=df.text.str.findall('[0-9]')
    df['digit']=digits
    df['digit_no']=hashtag.apply(len)
    #\W
    nondigits=df.text.str.findall(r'\W+')
    df['nondigit']=nondigits.apply(set)
    df['nondigit_unique']=df['nondigit'].apply(len)
    df['nondigit_no']=nondigits.apply(len)
    
    
    

In [5]:
df=train.copy()
spam=df[df['target']==1]
ham=df[df['target']==0]

for i in df.columns[5:]:
    if df[i].dtype!=object:
        print('{}; spam:{},ham:{}'.format(i,spam[i].mean(),ham[i].mean()))

### date

#### function

In [84]:
def monthtonum(shortMonth):
    return {
            'jan': 1,
            'feb': 2,
            'mar': 3,
            'apr': 4,
            'may': 5,
            'jun': 6,
            'jul': 7,
            'aug': 8,
            'sep': 9, 
            'oct': 10,
            'nov': 11,
            'dec': 12
    }[shortMonth]

def date_sorter(df_series,verbose=True):
    import pandas as pd
    import re
    import numpy as np
    
    df=pd.DataFrame(df_series)
    text=df_series.str
    df['raw']=np.nan
    df['year1']=np.nan
    df['year2']=np.nan
    df['month1']=np.nan
    df['month2']=np.nan
    df['day1']=np.nan
    df['day2']=np.nan
    
    if verbose:
        print('text ready')
    #format 1a  yyyy
    #1c ?/?/yy = 1d yy/?/?
    
    format1a=r'\d?\d[-/]\d?\d[-/]\d\d\d\d'
    format1b=r'\d{1,2}[-/]\d{1,2}[/-]\d{2}'
    
    ia=[]
    ib=[]
    
    for n,x in enumerate(text.findall(format1a)):
        if len(x)>0:
            date=re.split(r'\W',x[0])
            if 2023>int(date[2])>1000:
                if int(date[1])>12:
                    month=int(date[0])
                    day=int(date[1])
                if int(date[0])>12:
                    month=int(date[1])
                    day=int(date[0])
                else:
                    month=int(date[0])
                    day=int(date[1])

                year=int(date[2])
            
            df.loc[n,'year1']=year
            df.loc[n,'month1']=month
            df.loc[n,'day1']=day
                
            ia.append(n)
                
    if verbose:
        print('f1a , {} converted'.format(len(ia)))

    for n,x in enumerate(text.findall(format1b)):
        if len(x)>0:
            for m in range(0,len(x)):                           
                date=re.split(r'\W',x[m])
                if int(date[2])>23:
                    year=1900+int(date[2])
                else: year=2000+int(date[2])

                if int(date[1])>12:
                    month=int(date[0])
                    day=int(date[1])
                if int(date[0])>12:
                    month=int(date[1])
                    day=int(date[0])
                else:
                    month=int(date[0])
                    day=int(date[1])

                df.loc[n,'year{}'.format(m+1)]=year
                df.loc[n,'month{}'.format(m+1)]=month
                df.loc[n,'day{}'.format(m+1)]=day
                
            ib.append(n)

    if verbose:
        print('f1a , {} converted'.format(len(ib)))

    format2a=format2a=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D{,3}|\w+)\D{1,2}\d?\d\D{1}\d\d?\d?\d'
    i2a=[]

    for n,x in enumerate(text.findall(format2a)):
        if len(x)>0:
            MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
            month=monthtonum(MMM)
            date=re.split(' ',x[0])
            year=int(date[-1])
            day=date[-2]

            if year<23:
                year+2000
            if year<100:
                year+1900
                
            df.loc[n,'year1']=int(year)
            df.loc[n,'month1']=int(month)
            df.loc[n,'day1']=int(day)
                
            i2a.append(n)

    if verbose:
        print('f2a , {} converted'.format(len(i2a)))
    
    format2b=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D{,3}|\w+)\D{1,2}\d?\d(?:\w{2}|\w\w\D)\d\d?\d?\d'
    i2b=[]

    for n,x in enumerate(text.findall(format2b)):
        if len(x)>0:
            MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
            month=monthtonum(MMM)
            
            date=re.split(' ',x[0])
            if len(date)==3:
                year=date[-1]
                day=re.findall(r'\d?\d',date[-2])[0]

            else:
                date=re.split(r'th|st|rd|nd',x[0])
                year=date[-1]
                day=re.findall(r'\d?\d',date[-2])[0]

            df.loc[n,'year1']=int(year)
            df.loc[n,'month1']=int(month)
            df.loc[n,'day1']=int(day)
                
            i2b.append(n)
        
    if verbose:
        print('f2b , {} converted'.format(len(i2b)))

    format3=r'\d?\d\D(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D|\w+\D)\d\d\d\d'
    i3=[]
    for n,x in enumerate(text.findall(format3)):
        if len(x)>0:
            MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
            month=monthtonum(MMM)

            date=re.split(r'[ -/]',x[0])
            if len(date)==3:
                year=date[-1]
                day=date[0]
                
            df.loc[n,'year1']=int(year)
            df.loc[n,'month1']=int(month)
            df.loc[n,'day1']=int(day)
            i3.append(n)
            
    if verbose:
        print('f3 , {} converted'.format(len(i3)))

    #Mar 20
    format4a=r'(?:Jan|Feb|March|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\D\d\d'
    i4a=[]
    for n,x in enumerate(text.findall(format4a)):
        if len(x)>0:
            if np.isnan(df.loc[n,'month1']) & np.isnan(df.loc[n,'day1']):
                MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
                month=monthtonum(MMM)
                
                day=re.findall(r'\d?\d',x[0])[0]
                
                df.loc[n,'month1']=int(month)
                df.loc[n,'day1']=int(day)
                
                i4a.append(n)
    if verbose:
        print('f4a , {} converted'.format(len(i4a)))
        
    format4b=r'\d?\d\D(?:Jan|Feb|March|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
    i4b=[]
    for n,x in enumerate(text.findall(format4b)):
        if len(x)>0:
            if np.isnan(df.loc[n,'month1']) & np.isnan(df.loc[n,'day1']):
                MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
                month=monthtonum(MMM)
                
                day=re.findall(r'\d?\d',x[0])[0]
                
                df.loc[n,'month1']=int(month)
                df.loc[n,'day1']=int(day)
                
                i4b.append(n)
    if verbose:
        print('f4b , {} converted'.format(len(i4b)))

        #    Feb 2009; Sep 2009; Oct 2010
    format5=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\D+\d{4}'
    i5=[]

    #check if nan in ['date']
    #using pd.isnull(df['date'][n])
    for n,x in enumerate(text.findall(format5)):
        if len(x)>0:
            if np.isnan(df.loc[n,'month1']) & np.isnan(df.loc[n,'year1']):
                date=re.split(r'[ -/]',x[0])
                if len(date)<5:
                    yyyy=int(re.findall(r'\d\d\d\d',x[0])[0])
                    if 2023>yyyy>1000: 
                        MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
                        month=monthtonum(MMM)    
                        year=yyyy

                        df.loc[n,'month1']=int(month)
                        df.loc[n,'year1']=int(year)
                        i4b.append(n)
                
    if verbose:
        print('f5 , {} converted'.format(len(i5)))

    #    
    # dd mm
    format6a=r'\d{1,2}[/-]\d{1,2}'
    i6a=[]
    
    for n,x in enumerate(text.findall(format6a)):
        if len(x)==1:
            i6a.append(n)
            date=re.split(r'[/-]',x[0])
            a=int(date[0])
            b=int(date[1])
            if a==0 or b==0: i6a.remove(n)
            if a>32 or b>32: i6a.remove(n)
            if 13<a<32 and 13<b<32: i6a.remove(n)
            
    for i in i6a:

        date=re.split(r'[/-]',s[i][0])
        a=int(date[0])
        b=int(date[1])
        if a>12: 
            df.loc[n,'month1']=int(b)
            df.loc[n,'day1']=int(a)
        elif b>12: 
            df.loc[n,'month1']=int(a)
            df.loc[n,'day1']=int(b)
        elif a<12 and b<12: 
            df.loc[n,'month1']=int(b)
            df.loc[n,'day1']=int(a)                
            
    if verbose:
        print('f6a , {} converted'.format(len(i6a)))
        
    format6b=r'\d{1,2}[/-]\d{4}'
    i6b=[]
    
    for n,x in enumerate(text.findall(format6b)):
        if len(x)==1:
            date=re.split(r'[/-]',x[0])
            a=int(date[0])
            b=int(date[1])
            if a<12 and 1500<b<2023: 
                df.loc[n,'year1']=int(b)
                df.loc[n,'month1']=int(a)  
                i6b.append(n)            
            
    if verbose:
        print('f6b , {} converted'.format(len(i6b)))
        
    #    2009; 2010
    format7=r'\d{4}'
    i7=[]

    for n,x in enumerate(text.findall(format7)):
        if len(x)==1:
            if pd.isnull(df['year1'][n]):
                if 1500<int(x[0])<2100:
                    df.loc[n,'year1']=int(x[0])
                    i7.append(n)
        if len(x)==2:
            if pd.isnull(df['year1'][n]):
                if 1500<int(x[0])<2100:
                    year=int(x[0])
                    df.loc[n,'year1']=int(x[0])
                    i7.append(n)
            if pd.isnull(df['year2'][n]):
                if 1500<int(x[1])<2100:
                    year=int(x[1])
                    df.loc[n,'year2']=int(x[1])

    if verbose:
        print('f7 , {} converted'.format(len(i7)))        

    return df['year1'],df['year2'],df['month1'],df['month2'],df['day1'],df['day2']# Your answer here

In [7]:


def date_viewer(df_series,verbose=False):
    import pandas as pd
    import re
    import numpy as np
    
    df=pd.DataFrame(df_series)
    text=df_series.str
    if verbose:
        print('text ready')
    #format 1a  yyyy
    #1c ?/?/yy = 1d yy/?/?
    
    format1a=r'\d?\d[-/]\d?\d[-/]\d\d\d\d'
    format1b=r'\d{1,2}[-/]\d{1,2}[/-]\d{2}'

    df['raw']=np.nan
    df['date']=np.nan
    ia=[]
    ib=[]
    
    for n,x in enumerate(text.findall(format1a)):
        if len(x)>0:
            ia.append(x)
            
        
    if verbose:
        print('format1a:?/?/yyyy\n\n')
    
    for n,x in enumerate(text.findall(format1b)):
        if len(x)>0:
            ib.append(x)
        
    if verbose:
        print('format1b:?/?/yy\n\n')

#without th
    format2a=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D{,3}|\w+)\D{1,2}\d?\d\D{1}\d\d?\d?\d'
    i2a=[]
    for n,x in enumerate(text.findall(format2a)):
        if len(x)>0:
            i2a.append(x)
              
    if verbose:
        print('2a: MMM dd yy yyyy no th \n\n')
        
#with th 
    format2b=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D{,3}|\w+)\D{1,2}\d?\d(?:\w{2}|\w\w\D)\d\d?\d?\d'
    i2b=[]
    for n,x in enumerate(text.findall(format2b)):
        if len(x)>0:
            i2b.append(x)
              
    if verbose:
        print('2b: MMM dd yy yyyy with th \n\n')

    format3=r'\d?\d\D(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?:\D|\w+\D)\d\d\d\d'
    i3=[]
    for n,x in enumerate(text.findall(format3)):
        if len(x)>0:
            i3.append(x)
              
    if verbose:
        print('3: dd MMM yy yyyy \n')

    #Mar 20th, 2009; Mar 21st, 2009; Mar 22nd, 2009
    format4a=r'(?:Jan|Feb|March|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\D\d\d'
    i4a=[]
    for n,x in enumerate(text.findall(format4a)):
        if len(x)>0:
            i4a.append(x)
              
    if verbose:
        print('4: MMM dd(st) yy yyyy \n')
    

    format4b=r'\d?\d\D(?:Jan|Feb|March|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
    i4b=[]
    for n,x in enumerate(text.findall(format4b)):
        if len(x)>0:
            i4b.append(x)
              
    if verbose:
        print('4: MMM dd(st) yy yyyy \n')

    #    Feb 2009; Sep 2009; Oct 2010
    format5=r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\D+\d{4}'
    i5=[]

    #check if nan in ['date']
    #using pd.isnull(df['date'][n])
    for n,x in enumerate(text.findall(format5)):
        if len(x)>0:
            i5.append(x)
              
    if verbose:
        print('5: MMM yy yyyy \n')

#mm/dd or dd/mm
    format6a=r'\d{1,2}[/-]\d{1,2}'
    i6a=[]

    s=text.findall(format6a)
    for n,x in enumerate(s):
        if len(x)==1:
            i6.append(n)
            date=re.split(r'[/-]',x[0])
            a=int(date[0])
            b=int(date[1])
            if a==0 or b==0: i6.remove(n)
            if a>32 or b>32: i6.remove(n)
            if 13<a<32 and 13<b<32: i6.remove(n)
    i6a=s[i6]
              
    if verbose:
        print('6a: ??/?? \n')
        
    format6b=r'\d{1,2}[/-]\d{4}'
    i6b=[]

    for n,x in enumerate(text.findall(format6b)):
        if len(x)==1:
            i6b.append(x)
              
    if verbose:
        print('6b: mm/yyyy \n')

    #    2009; 2010
    format7=r'\d{4}'
    i7=[]

    for n,x in enumerate(text.findall(format7)):
        if len(x)>0:
            i7.append(x)
              
    if verbose:
        print('7: yyyy \n')
    
    result=[]
    for i in [ia,ib,i2a,i2b,i3,i4a,i4b,i5,i6a,i6b,i7]:
        a=pd.Series(i)
        result.append(a)
        
    return result# Your answer here

#### add date

In [101]:
i6a

74        [8/6]
77      [08/06]
78        [8/6]
92        [8/6]
143     [29-07]
         ...   
6978      [1/2]
6990      [3/4]
7135      [8/6]
7212    [03/08]
7316    [08/02]
Name: text, Length: 73, dtype: object

In [85]:
a,b,c,d,e,f=date_sorter(train.text)

text ready
f1a , 15 converted
f1a , 42 converted
f2a , 17 converted
f2b , 3 converted
f3 , 6 converted
f4a , 7 converted
f4b , 1 converted
f5 , 0 converted
f6a , 73 converted
f6b , 16 converted
f7 , 203 converted


In [4]:
view=date_viewer(train.text)

In [145]:
0#ia
1#ib 有兩個date
2#i2a Mar/w+  no th
3#i2b th
4#i3 split on -
5#i4a 唔要at
6#i4b
7#i5 成個唔要
8#i6a 分開做 dd/mm & mm/yyyy
9#i6b
10#i7 set time range
#add dd/MMM

10

In [80]:
for a in view[10]:
    print(a)

['2013']
['2468']
['1600']
['2015', '2781']
['2010']
['8015']
['2015']
['2015']
['2015']
['2015']
['2015']
['2015']
['1000']
['1960']
['2005']
['5400']
['9306']
['2011']
['2015', '2014']
['1023']
['1023']
['2011']
['2613', '1008']
['1008']
['1402']
['1411']
['1392']
['1441']
['2414']
['1386']
['1434']
['2016']
['6615']
['1998', '1620']
['2005']
['2011']
['2014']
['1128']
['2732', '1236']
['1916']
['4103']
['1907']
['2015']
['1880']
['1880']
['1043']
['1921']
['1979', '2017', '2019']
['2208']
['1997']
['5499']
['2015']
['7986']
['5000']
['5168']
['1969', '9847']
['1862']
['9973']
['1975']
['2015']
['2015']
['2040']
['2008']
['1965']
['2000']
['1980']
['1995']
['8319']
['1200']
['2000']
['2013']
['2015']
['2015']
['2015']
['2015']
['2015']
['7000']
['4900', '1424']
['2011']
['2001']
['2001']
['1943']
['5261']
['1402']
['2015']
['2015']
['1895']
['2015']
['1775']
['2015']
['2009']
['2015']
['1986', '2016']
['2015']
['2015']
['4500']
['4500']
['2015']
['2637']
['2015']
['2015']
['3000']
['

In [62]:
format6b=r'\d{1,2}[/-]\d{4}'
i6b=[]

text=train.text.copy().str
s=text.findall(format6a)
for n,x in enumerate(s):
    if len(x)==1:
        i6.append(n)
        date=re.split(r'[/-]',x[0])
        a=int(date[0])
        b=int(date[1])
        if a==0 or b==0: i6.remove(n)
        if a>32 or b>32: i6.remove(n)
        if 13<a<32 and 13<b<32: i6.remove(n)
        
for i in i6:

    date=re.split(r'[/-]',s[i][0])
    a=int(date[0])
    b=int(date[1])
    if a>12: 
        montha.append(int(b))
        daya.append(int(a))
    elif b>12: 
        monthb.append(int(a))
        dayb.append(int(b))
    elif a<12 and b<12: 
        monthc.append(int(b))
        dayc.append(int(a))


In [49]:
for n,i in enumerate(i6):
    if n<10:
        date=re.split(r'[/-]',s[i][0])
        print(date)
    

['8', '6']
['08', '06']
['8', '6']
['8', '6']
['29', '07']
['29', '07']
['29', '07']
['29', '07']
['29', '07']
['9', '11']


In [59]:
len(i6)

73

In [15]:
df=train.copy()

for n,x in enumerate(view[8]):
    if len(x)>0:
        date=re.split(r'[ -/]',x[0])
        if len(date)<5:
            yyyy=int(re.findall(r'\d\d\d\d',x[0])[0])
            if 2023>yyyy>1000: 
                MMM=re.findall(r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',x[0])[0].lower()
                month=monthtonum(MMM)    
                year=yyyy

                print(month,year)


11 2013
8 2011
10 1895
9 1986
4 2014
11 2011
12 2013
4 1935
9 2015
12 2011
6 2015
8 2015
7 2015
1 2007
8 2012
