In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords


In [4]:
wn = nltk.WordNetLemmatizer()


In [5]:
dataset=pd.read_csv('/content/tweets.csv')
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [6]:
dataset["tweet_length"]=dataset["tweet"].apply(lambda x:len(x)-x.count(" "))


In [7]:
def punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100
dataset['punct%']=dataset["tweet"].apply(lambda x: punct(x))
dataset.head()

Unnamed: 0,id,label,tweet,tweet_length,punct%
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,116,13.8
1,2,0,Finally a transparant silicon case ^^ Thanks t...,115,13.9
2,3,0,We love this! Would you go? #talk #makememorie...,109,16.5
3,4,0,I'm wired I know I'm George I was made that wa...,96,17.7
4,5,1,What amazing service! Apple won't even talk to...,102,4.9


In [8]:
#removing the non alphabets

In [9]:
dataset["tweet"]=dataset["tweet"].str.replace("[^a-z ]","")

In [10]:
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    stopwords=nltk.corpus.stopwords.words('english')
    text = [word for word in tokens if word not in stopwords]
    return text
# adding new column tweet_nostopwords which consists of tokenized tweets with no stopwords.
dataset['tweet_nostopwords'] = dataset['tweet'].apply(lambda x: clean_text(x.lower()))
dataset.head()

Unnamed: 0,id,label,tweet,tweet_length,punct%,tweet_nostopwords
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,116,13.8,"[fingerprint, pregnancy, test, httpsgooglh1mfq..."
1,2,0,Finally a transparant silicon case ^^ Thanks t...,115,13.9,"[finally, transparant, silicon, case, thanks, ..."
2,3,0,We love this! Would you go? #talk #makememorie...,109,16.5,"[love, would, go, talk, makememories, unplug, ..."
3,4,0,I'm wired I know I'm George I was made that wa...,96,17.7,"[im, wired, know, im, george, made, way, iphon..."
4,5,1,What amazing service! Apple won't even talk to...,102,4.9,"[amazing, service, apple, wont, even, talk, qu..."


In [11]:
import nltk


In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
def lemmatizing(tokenized_text):
    text =" ".join([wn.lemmatize(word) for word in tokenized_text])
    return text

# adding new column tweet_lemmatized which consists of lemmatized tweets.
dataset['tweet_lemmatized'] = dataset['tweet_nostopwords'].apply(lambda x: lemmatizing(x))

dataset.head()

Unnamed: 0,id,label,tweet,tweet_length,punct%,tweet_nostopwords,tweet_lemmatized
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,116,13.8,"[fingerprint, pregnancy, test, httpsgooglh1mfq...",fingerprint pregnancy test httpsgooglh1mfqv an...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,115,13.9,"[finally, transparant, silicon, case, thanks, ...",finally transparant silicon case thanks uncle ...
2,3,0,We love this! Would you go? #talk #makememorie...,109,16.5,"[love, would, go, talk, makememories, unplug, ...",love would go talk makememories unplug relax i...
3,4,0,I'm wired I know I'm George I was made that wa...,96,17.7,"[im, wired, know, im, george, made, way, iphon...",im wired know im george made way iphone cute d...
4,5,1,What amazing service! Apple won't even talk to...,102,4.9,"[amazing, service, apple, wont, even, talk, qu...",amazing service apple wont even talk question ...


In [14]:
dataset.describe()

Unnamed: 0,id,label,tweet_length,punct%
count,7920.0,7920.0,7920.0,7920.0
mean,3960.5,0.255808,113.076263,11.582677
std,2286.451399,0.436342,45.157979,5.023823
min,1.0,0.0,7.0,0.0
25%,1980.75,0.0,90.0,7.6
50%,3960.5,0.0,108.0,12.6
75%,5940.25,1.0,117.0,15.1
max,7920.0,1.0,370.0,40.4


In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7920 non-null   int64  
 1   label              7920 non-null   int64  
 2   tweet              7920 non-null   object 
 3   tweet_length       7920 non-null   int64  
 4   punct%             7920 non-null   float64
 5   tweet_nostopwords  7920 non-null   object 
 6   tweet_lemmatized   7920 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 433.2+ KB


In [16]:
dataset_train=dataset.iloc[0:7920,:]
dataset_test=dataset.iloc[7920:,:]

In [17]:
rf = RandomForestClassifier(n_jobs=-1,n_estimators=300)

In [18]:


# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()


X_train = vectorizer.fit_transform(dataset_train['tweet_lemmatized'])




rf.fit(X_train, dataset_train['label'])

In [19]:
rf.fit(X_train['tweet_lemmatized'],X_train['label'])

IndexError: Index dimension must be 1 or 2

In [None]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)