In [39]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model, model_selection

In [2]:
tweet = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [6]:
print('There are {} rows and {} columns in train'.format(tweet.shape[0],tweet.shape[1]))
print('There are {} rows and {} columns in test'.format(test.shape[0],test.shape[1]))

There are 7613 rows and 5 columns in train
There are 3263 rows and 4 columns in test


In [10]:
df=pd.concat([tweet,test])
tweet.shape
test.shape
df.shape

(10876, 5)

In [11]:
#Removing the urls is a typical first step
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [14]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

In [16]:
#Now to remove HTML tags
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

In [17]:
df['text']=df['text'].apply(lambda x : remove_html(x))

In [18]:
#Finally to remove emoji's
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [19]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))

In [21]:
#Also to remove punctuation
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [24]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

In [25]:
training_df=df[df['target'].notnull()]
oot_df=df[df['target'].isnull()]

In [27]:
training_df.shape
oot_df.shape

(3263, 5)

In [28]:
oot_df = oot_df.drop('target', 1)

In [29]:
oot_df.shape

(3263, 4)

In [34]:
def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = training_df["text"].tolist()
list_labels = training_df["target"].tolist()
oot_corpus = oot_df["text"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, 
                                                                                random_state=32)

X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)
oot_df_counts = count_vectorizer.transform(oot_corpus)

In [49]:
X_test_counts

<1523x15411 sparse matrix of type '<class 'numpy.int64'>'
	with 16983 stored elements in Compressed Sparse Row format>

In [37]:
clf = linear_model.RidgeClassifier()

In [40]:
scores = model_selection.cross_val_score(clf, X_train_counts, y_train, cv=3, scoring="f1")
scores

array([0.71378092, 0.7141994 , 0.73176471])

In [41]:
scores = model_selection.cross_val_score(clf, X_test_counts, y_test, cv=3, scoring="f1")
scores

array([0.60714286, 0.62433862, 0.69367089])

In [42]:
clf.fit(X_train_counts, y_train)

RidgeClassifier()

In [43]:
sample_submission = pd.read_csv("data/sample_submission.csv")

In [44]:
sample_submission["target"] = clf.predict(oot_df_counts)

In [45]:
sample_submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.382164
std,3146.427221,0.485991
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0
