In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the training tweets from the working directory; later cells read the
# 'label' and 'tweet' columns of this frame.
train_data = pd.read_csv('train_tweets.csv')

In [4]:
# Class balance check: number of rows for each distinct 'label' value.
train_data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [5]:
def drop_features(features, data):
    """Remove the given columns from ``data`` in place.

    Parameters
    ----------
    features : a column label or a list of column labels to remove.
    data : the pandas DataFrame to modify; the caller's object is changed.

    Returns
    -------
    None. Nothing is returned because the frame is edited directly.
    """
    data.drop(columns=features, inplace=True)

In [6]:
import re
# Example: replace @mentions and every character that is not an ASCII letter, digit, space or tab with a space.
re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ","ouch...junior is angryð#got7 #junior #yugyo..., @user")


'ouch   junior is angry     got7  junior  yugyo      '

In [7]:
def process_tweet(tweet):
    """Normalise a tweet into lowercase, space-separated ASCII word tokens.

    Steps:
      1. lowercase the text;
      2. replace every @mention with a space;
      3. replace every character that is not an ASCII letter, digit, space
         or tab with a space (this also strips '#', punctuation and emoji);
      4. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing survives cleaning.
    """
    # Handles may contain underscores. Without '_' in the character class,
    # "@some_user" would only lose "@some" and leave "user" behind as a token.
    without_noise = re.sub(
        r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])", " ", tweet.lower()
    )
    return " ".join(without_noise.split())


In [8]:
# Store a cleaned copy of every tweet in a new column; the raw 'tweet' column is kept.
train_data['processed_tweets'] = train_data['tweet'].map(process_tweet)

In [9]:
# Preview raw and cleaned tweets side by side. A handful of rows is enough to
# check the cleaning; asking for 2000 only bloats the saved notebook output.
train_data.head(10)

Unnamed: 0,id,label,tweet,processed_tweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,2 2 huge fan fare and big talking before they ...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here i m it s so gr8


In [10]:
# Write the first 2000 rows (raw and cleaned columns) to disk for inspection
# outside the notebook; the index is not written.
final_result = train_data.iloc[:2000]
final_result.to_csv('processed.csv', index=False)

In [11]:
# Remove the target and the raw text from train_data in place. After this cell
# only the 'id' and 'processed_tweets' columns remain on the frame.
drop_features(['label','tweet'],train_data)

In [12]:
# Confirm that only 'id' and 'processed_tweets' are left. Ten rows are plenty
# for a visual check; 2000 rows add nothing but output size.
train_data.head(10)

Unnamed: 0,id,processed_tweets
0,1,when a father is dysfunctional and is so selfi...
1,2,thanks for lyft credit i can t use cause they ...
2,3,bihday your majesty
3,4,model i love u take with u all the time in ur
4,5,factsguide society now motivation
5,6,2 2 huge fan fare and big talking before they ...
6,7,camping tomorrow danny
7,8,the next school year is the year for exams can...
8,9,we won love the land allin cavs champions clev...
9,10,welcome here i m it s so gr8


In [13]:
# Write the first 2000 rows of the reduced frame (id + cleaned text) to disk;
# the index is not written.
final_result = train_data.iloc[:2000]
final_result.to_csv('after_process.csv', index=False)


In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# The classifier must be trained on the binary 'label' column, not on the row
# 'id' (a unique identifier per tweet, which the binary f1_score used later
# cannot evaluate). 'label' was dropped from train_data in an earlier cell, so
# re-read just that column from the source CSV. No rows have been removed or
# reordered since loading, so the labels line up positionally with the
# processed tweets.
train_labels = pd.read_csv('train_tweets.csv', usecols=['label'])['label']
assert len(train_labels) == len(train_data), "label/tweet row counts diverged"

# Same test_size and seed as before, so the tweet split itself is unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    train_data["processed_tweets"],
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [18]:
# Bag-of-words token counts, ignoring scikit-learn's built-in English stop words.
count_vect = CountVectorizer(stop_words='english')
# TF-IDF re-weighting of those counts: sublinear term frequency, L2-normalised rows.
transformer = TfidfTransformer(sublinear_tf=True, norm='l2')


In [19]:
# Fit the vocabulary and the TF-IDF weights on the training split only, then
# turn that split into count and TF-IDF matrices.
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)


In [20]:
# Sanity check: both matrices should share the same (rows, vocabulary size) shape.
print(x_train_counts.shape, x_train_tfidf.shape, sep='\n')


(25569, 33735)
(25569, 33735)


In [21]:
# Project the held-out split with the already-fitted vectorizer and transformer
# (transform only, no refit), so it shares the training feature columns.
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)


In [22]:
# Sanity check: the column count must match the training matrices.
print(x_test_counts.shape, x_test_tfidf.shape, sep='\n')


(6393, 33735)
(6393, 33735)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so repeated runs build the same trees and give the same
# metrics (the train/test split is already seeded with 42). n_jobs=-1 builds
# the 2000 trees on all available cores; it affects speed only.
model = RandomForestClassifier(n_estimators=2000, random_state=42, n_jobs=-1)
model.fit(x_train_tfidf,y_train)

In [None]:
# Predict the target for every row of the held-out TF-IDF matrix.
predictions = model.predict(x_test_tfidf)

In [None]:
from sklearn.metrics import confusion_matrix,f1_score
# Cross-tabulate the true held-out targets against the model's predictions.
confusion_matrix(y_test,predictions)


In [None]:
# F1 on the held-out split using f1_score's default settings.
# NOTE(review): the defaults assume a binary target - confirm y_test holds the 0/1 labels.
f1_score(y_test,predictions)

In [26]:
# Load the test tweets from the working directory; the 'tweet' column is cleaned next.
test_data = pd.read_csv('test_tweets.csv')

In [27]:
# Clean the test tweets with the same function used for the training tweets.
# The column here is 'processed_tweet' (singular), unlike the training frame.
test_data['processed_tweet'] = test_data['tweet'].map(process_tweet)

In [28]:
# Preview raw and cleaned test tweets. Ten rows are enough to check the
# cleaning; 2000 rows only bloat the saved notebook output.
test_data.head(10)

Unnamed: 0,id,tweet,processed_tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...
5,31968,choose to be :) #momtips,choose to be momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...,something inside me dies eyes ness smokeyeyes ...
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...,finished tattoo inked ink loveit thanks aleeee
8,31971,@user @user @user i will never understand why...,i will never understand why my dad left me whe...
9,31972,#delicious #food #lovelife #capetown mannaep...,delicious food lovelife capetown mannaepicure ...


In [29]:
# Write the first five rows (raw and cleaned columns) of the test frame to disk.
# NOTE(review): the training exports keep 2000 rows; confirm five is intended here.
final_result = test_data.iloc[:5]
final_result.to_csv('test_process.csv', index=False)

In [30]:
# Reload the test tweets from disk, replacing the frame built in the previous cells.
test_data = pd.read_csv('test_tweets.csv')

In [31]:
# Rebuild the cleaned-text column on the freshly reloaded test frame.
test_data['processed_tweet'] = test_data['tweet'].map(process_tweet)

In [32]:
# Remove the raw text from test_data in place; 'id' and 'processed_tweet' remain.
drop_features(['tweet'],test_data)

In [33]:
# Confirm that only 'id' and 'processed_tweet' are left. Ten rows are plenty
# for a visual check; 2000 rows add nothing but output size.
test_data.head(10)

Unnamed: 0,id,processed_tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew eli ...
5,31968,choose to be momtips
6,31969,something inside me dies eyes ness smokeyeyes ...
7,31970,finished tattoo inked ink loveit thanks aleeee
8,31971,i will never understand why my dad left me whe...
9,31972,delicious food lovelife capetown mannaepicure ...


In [34]:
# Write the first five rows of the reduced test frame (id + cleaned text) to disk.
# NOTE(review): the training exports keep 2000 rows; confirm five is intended here.
final_result = test_data.iloc[:5]
final_result.to_csv('aftertest_process.csv', index=False)

In [35]:
# Refit the vocabulary on ALL training tweets, then encode the test tweets with it.
# NOTE(review): this refits the same count_vect object that was fitted on x_train,
# so its vocabulary is replaced (33735 columns become 38763 per the printed shapes).
# Matrices made earlier from x_train/x_test, and any model fitted on them, no longer
# match what count_vect.transform produces from here on.
train_counts = count_vect.fit_transform(train_data['processed_tweets'])
test_counts = count_vect.transform(test_data['processed_tweet'])


In [36]:
# Sanity check: train and test count matrices must have the same number of columns.
print(train_counts.shape, test_counts.shape, sep='\n')


(31962, 38763)
(17197, 38763)


In [37]:
# Refit the TF-IDF weights on the full training counts and apply them to the test counts.
# NOTE(review): this reuses the transformer object fitted earlier on x_train_counts,
# so the previously learned weights are overwritten by this fit.
train_tfidf = transformer.fit_transform(train_counts)
test_tfidf = transformer.transform(test_counts)


In [38]:
# Sanity check: TF-IDF matrices keep the shapes of the count matrices they came from.
print(train_tfidf.shape, test_tfidf.shape, sep='\n')


(31962, 38763)
(17197, 38763)
